diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -236,7 +236,6 @@ LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.") LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") -LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2311,12 +2311,6 @@ defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse", LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse, PosFlag, NegFlag, BothFlags<[NoArgumentUnused, HelpHidden]>>; -def fopenmp_cuda_parallel_target_regions : Flag<["-"], "fopenmp-cuda-parallel-target-regions">, Group, - Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, - HelpText<"Support parallel execution of target regions on Cuda-based devices.">; -def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group, - Flags<[NoArgumentUnused, HelpHidden]>, - HelpText<"Support only serial execution of target regions on Cuda-based devices.">; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -439,16 +439,11 @@ /// The data for the single globalized variable. struct MappedVarData { + MappedVarData() = default; /// Corresponding field in the global record. - const FieldDecl *FD = nullptr; + llvm::Value *GlobalizedVal = nullptr; /// Corresponding address. Address PrivateAddr = Address::invalid(); - /// true, if only one element is required (for latprivates in SPMD mode), - /// false, if need to create based on the warp-size. - bool IsOnePerTeam = false; - MappedVarData() = delete; - MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false) - : FD(FD), IsOnePerTeam(IsOnePerTeam) {} }; /// The map of local variables to their addresses in the global memory. using DeclToAddrMapTy = llvm::MapVector; @@ -460,29 +455,12 @@ EscapedParamsTy EscapedParameters; llvm::SmallVector EscapedVariableLengthDecls; llvm::SmallVector EscapedVariableLengthDeclsAddrs; - const RecordDecl *GlobalRecord = nullptr; - llvm::Optional SecondaryGlobalRecord = llvm::None; - llvm::Value *GlobalRecordAddr = nullptr; llvm::Value *IsInSPMDModeFlag = nullptr; std::unique_ptr MappedParams; }; /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap FunctionGlobalizedDecls; - /// List of records for the globalized variables in target/teams/distribute - /// contexts. Inner records are going to be joined into the single record, - /// while those resulting records are going to be joined into the single - /// union. This resulting union (one per CU) is the entry point for the static - /// memory management runtime functions. - struct GlobalPtrSizeRecsTy { - llvm::GlobalVariable *UseSharedMemory = nullptr; - llvm::GlobalVariable *RecSize = nullptr; - llvm::GlobalVariable *Buffer = nullptr; - SourceLocation Loc; - llvm::SmallVector Records; - unsigned RegionCounter = 0; - }; - llvm::SmallVector GlobalizedRecords; llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; /// List of the records with the list of fields for the reductions across the /// teams. Used to build the intermediate buffer for the fast teams diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1102,17 +1102,6 @@ } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1164,10 +1153,6 @@ CGM.getModule(), OMPRTL___kmpc_kernel_init), Args); - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack)); - emitGenericVarsProlog(CGF, WST.Loc); } @@ -1236,17 +1221,6 @@ } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1268,12 +1242,6 @@ CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init), Args); - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd)); - } - CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); @@ -1679,16 +1647,13 @@ static_cast(CGF.CGM.getOpenMPRuntime()); if (GlobalizedRD) { auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().GlobalRecord = GlobalizedRD; I->getSecond().MappedParams = std::make_unique(); DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const auto &Pair : MappedDeclsFields) { assert(Pair.getFirst()->isCanonicalDecl() && "Expected canonical declaration"); - Data.insert(std::make_pair(Pair.getFirst(), - MappedVarData(Pair.getSecond(), - /*IsOnePerTeam=*/true))); + Data.insert(std::make_pair(Pair.getFirst(), MappedVarData())); } } Rt.emitGenericVarsProlog(CGF, Loc); @@ -1717,282 +1682,69 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return; - if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { - QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); - QualType SecGlobalRecTy; - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - - llvm::PointerType *GlobalRecPtrTy = - CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); - llvm::Value *GlobalRecCastAddr; - llvm::Value *IsTTD = nullptr; - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - if (I->getSecond().SecondaryGlobalRecord.hasValue()) { - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_parallel_level), - {RTLoc, ThreadID}); - IsTTD = Bld.CreateIsNull(PL); - } - llvm::Value *IsSPMD = Bld.CreateIsNotNull( - CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), - CharUnits::fromQuantity(Alignment)); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); - if (const RecordDecl *SecGlobalizedVarsRecord = - I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { - SecGlobalRecTy = - CGM.getContext().getRecordType(SecGlobalizedVarsRecord); - - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - Size = Bld.CreateSelect( - IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); - } - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, - /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); - GlobalRecCastAddr = Phi; - I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().Records.size() < 2 && - "Expected less than 2 globalized records: one for target and one " - "for teams."); - unsigned Offset = 0; - for (const RecordDecl *RD : GlobalizedRecords.back().Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); - unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); - Offset = - llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); - } - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - Offset = llvm::alignTo(Offset, Alignment); - GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); - ++GlobalizedRecords.back().RegionCounter; - if (GlobalizedRecords.back().Records.size() == 1) { - assert(KernelStaticGlobalized && - "Kernel static pointer must be initialized already."); - auto *UseSharedMemory = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$is_shared"); - UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, Loc); - auto *StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, nullptr); - auto *RecSize = new llvm::GlobalVariable( - CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$size"); - RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, - CGM.getContext().getSizeType(), Loc); - llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_get_team_static_memory), - GlobalRecordSizeArg); - GlobalizedRecords.back().Buffer = StaticGlobalized; - GlobalizedRecords.back().RecSize = RecSize; - GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; - GlobalizedRecords.back().Loc = Loc; - } - assert(KernelStaticGlobalized && "Global address must be set already."); - Address FrameAddr = CGF.EmitLoadOfPointer( - Address(KernelStaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs()); - llvm::Value *GlobalRecValue = - Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); - } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - bool UseSharedMemory = - IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), - IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack - : OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - } - LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); - - // Emit the "global alloca" which is a GEP from the global declaration - // record using the pointer returned by the runtime. - LValue SecBase; - decltype(I->getSecond().LocalVarData)::const_iterator SecIt; - if (IsTTD) { - SecIt = I->getSecond().SecondaryLocalVarData->begin(); - llvm::PointerType *SecGlobalRecPtrTy = - CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); - SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( - Bld.CreatePointerBitCastOrAddrSpaceCast( - I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), - SecGlobalRecTy); + for (auto &Rec : I->getSecond().LocalVarData) { + const auto *VD = cast(Rec.first); + bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); + QualType VarTy = VD->getType(); + + // Get the local allocation of a firstprivate variable before sharing + llvm::Value *ParValue; + if (EscapedParam) { + LValue ParLVal = + CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); + ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); } - for (auto &Rec : I->getSecond().LocalVarData) { - bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); - llvm::Value *ParValue; - if (EscapedParam) { - const auto *VD = cast(Rec.first); - LValue ParLVal = - CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); - ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); - } - LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); - // Emit VarAddr basing on lane-id if required. - QualType VarTy; - if (Rec.second.IsOnePerTeam) { - VarTy = Rec.second.FD->getType(); - } else { - Address Addr = VarAddr.getAddress(CGF); - llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - Addr.getElementType(), Addr.getPointer(), - {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - VarTy = - Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue( - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, - AlignmentSource::Decl); - } - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - assert(I->getSecond().IsInSPMDModeFlag && - "Expected unknown execution mode or required SPMD check."); - if (IsTTD) { - assert(SecIt->second.IsOnePerTeam && - "Secondary glob data must be one per team."); - LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); - VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), - VarAddr.getPointer(CGF)), - VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - } - Address GlobalPtr = Rec.second.PrivateAddr; - Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); - Rec.second.PrivateAddr = Address( - Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, - LocalAddr.getPointer(), GlobalPtr.getPointer()), - LocalAddr.getAlignment()); - } - if (EscapedParam) { - const auto *VD = cast(Rec.first); - CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, - VarAddr.getAddress(CGF)); - } - if (IsTTD) - ++SecIt; + + // Allocate space for the variable to be globalized + llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; + llvm::Instruction *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + AllocArgs, VD->getName()); + + // Cast the void pointer and get the address of the globalized variable. + llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); + llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); + Rec.second.GlobalizedVal = VoidPtr; + + // Assign the local allocation to the newly globalized location. + if (EscapedParam) { + CGF.EmitStoreOfScalar(ParValue, VarAddr); + I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF)); } + if (auto *DI = CGF.getDebugInfo()) + VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation())); } - for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding + for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) { + // Use actual memory size of the VLA object including the padding // for alignment purposes. - CGBuilderTy &Bld = CGF.Builder; llvm::Value *Size = CGF.getTypeSize(VD->getType()); CharUnits Align = CGM.getContext().getDeclAlign(VD); Size = Bld.CreateNUWAdd( Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); llvm::Value *AlignVal = llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); + Size = Bld.CreateUDiv(Size, AlignVal); Size = Bld.CreateNUWMul(Size, AlignVal); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); - LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), + + // Allocate space for this VLA object to be globalized. + llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; + llvm::Instruction *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + AllocArgs, VD->getName()); + + I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr); + LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(), CGM.getContext().getDeclAlign(VD), AlignmentSource::Decl); I->getSecond().MappedParams->setVarAddr(CGF, cast(VD), Base.getAddress(CGF)); - I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); } I->getSecond().MappedParams->apply(CGF); } @@ -2005,60 +1757,20 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I != FunctionGlobalizedDecls.end()) { - I->getSecond().MappedParams->restore(CGF); - if (!CGF.HaveInsertPoint()) - return; + // Deallocate the memory for each globalized VLA object for (llvm::Value *Addr : llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - Addr); + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + Addr); } - if (I->getSecond().GlobalRecordAddr) { - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().RegionCounter > 0 && - "region counter must be > 0."); - --GlobalizedRecords.back().RegionCounter; - // Emit the restore function only in the target region. - if (GlobalizedRecords.back().RegionCounter == 0) { - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(GlobalizedRecords.back().UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); - llvm::Value *Args[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - IsInSharedMemory}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory), - Args); - } - } else { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); - } + // Deallocate the memory for each globalized value + for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { + I->getSecond().MappedParams->restore(CGF); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + {Rec.second.GlobalizedVal}); } } } @@ -4170,6 +3882,7 @@ } if (!Body) return; + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = @@ -4183,7 +3896,6 @@ auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().MappedParams = std::make_unique(); - I->getSecond().GlobalRecord = GlobalizedVarsRecord; I->getSecond().EscapedParameters.insert( VarChecker.getEscapedParameters().begin(), VarChecker.getEscapedParameters().end()); @@ -4192,21 +3904,16 @@ DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); + Data.insert(std::make_pair(VD, MappedVarData())); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); - I->getSecond().SecondaryGlobalRecord = - VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); I->getSecond().SecondaryLocalVarData.emplace(); DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert( - std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); + Data.insert(std::make_pair(VD, MappedVarData())); } } if (!NeedToDelayGlobalization) { @@ -4497,185 +4204,8 @@ CGOpenMPRuntime::processRequiresDirective(D); } -/// Get number of SMs and number of blocks per SM. -static std::pair getSMsBlocksPerSM(CodeGenModule &CGM) { - std::pair Data; - if (CGM.getLangOpts().OpenMPCUDANumSMs) - Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; - if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) - Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; - if (Data.first && Data.second) - return Data; - switch (getCudaArch(CGM)) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: - return {16, 16}; - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - return {56, 32}; - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - return {84, 32}; - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::UNUSED: - case CudaArch::UNKNOWN: - break; - case CudaArch::LAST: - llvm_unreachable("Unexpected Cuda arch."); - } - llvm_unreachable("Unexpected NVPTX target without ptx feature."); -} - void CGOpenMPRuntimeGPU::clear() { - if (!GlobalizedRecords.empty() && - !CGM.getLangOpts().OpenMPCUDATargetParallel) { - ASTContext &C = CGM.getContext(); - llvm::SmallVector GlobalRecs; - llvm::SmallVector SharedRecs; - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - RecordDecl *SharedStaticRD = C.buildImplicitRecord( - "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - SharedStaticRD->startDefinition(); - for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { - if (Records.Records.empty()) - continue; - unsigned Size = 0; - unsigned RecAlignment = 0; - for (const RecordDecl *RD : Records.Records) { - QualType RDTy = C.getRecordType(RD); - unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); - RecAlignment = std::max(RecAlignment, Alignment); - unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity(); - Size = - llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); - } - Size = llvm::alignTo(Size, RecAlignment); - llvm::APInt ArySize(/*numBits=*/64, Size); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - const bool UseSharedMemory = Size <= SharedMemorySize; - auto *Field = - FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD, - SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - if (UseSharedMemory) { - SharedStaticRD->addDecl(Field); - SharedRecs.push_back(&Records); - } else { - StaticRD->addDecl(Field); - GlobalRecs.push_back(&Records); - } - Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); - Records.UseSharedMemory->setInitializer( - llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); - } - // Allocate SharedMemorySize buffer for the shared memory. - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore this code as sson as nvlink is fixed. - if (!SharedStaticRD->field_empty()) { - llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - auto *Field = FieldDecl::Create( - C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - SharedStaticRD->addDecl(Field); - } - SharedStaticRD->completeDefinition(); - if (!SharedStaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(SharedStaticRD); - llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, - llvm::UndefValue::get(LLVMStaticTy), - "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - C.getTargetAddressSpace(LangAS::cuda_shared)); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - StaticRD->completeDefinition(); - if (!StaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(StaticRD); - std::pair SMsBlockPerSM = getSMsBlocksPerSM(CGM); - llvm::APInt Size1(32, SMsBlockPerSM.second); - QualType Arr1Ty = - C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = - C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMArr2Ty), - "_openmp_static_glob_rd_$_"); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - } + if (!TeamsReductions.empty()) { ASTContext &C = CGM.getContext(); RecordDecl *StaticRD = C.buildImplicitRecord( diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5626,13 +5626,6 @@ options::OPT_fno_openmp_cuda_mode, /*Default=*/false)) CmdArgs.push_back("-fopenmp-cuda-mode"); - // When in OpenMP offloading mode with NVPTX target, forward - // cuda-parallel-target-regions flag - if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions, - options::OPT_fno_openmp_cuda_parallel_target_regions, - /*Default=*/true)) - CmdArgs.push_back("-fopenmp-cuda-parallel-target-regions"); - // When in OpenMP offloading mode with NVPTX target, check if full runtime // is required. if (Args.hasFlag(options::OPT_fopenmp_cuda_force_full_runtime, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3464,9 +3464,6 @@ if (Opts.OpenMPCUDAMode) GenerateArg(Args, OPT_fopenmp_cuda_mode, SA); - if (Opts.OpenMPCUDATargetParallel) - GenerateArg(Args, OPT_fopenmp_cuda_parallel_target_regions, SA); - if (Opts.OpenMPCUDAForceFullRuntime) GenerateArg(Args, OPT_fopenmp_cuda_force_full_runtime, SA); @@ -3882,12 +3879,6 @@ Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); - // Set CUDA support for parallel execution of target regions for OpenMP target - // NVPTX/AMDGCN if specified in options. - Opts.OpenMPCUDATargetParallel = - Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && - Args.hasArg(options::OPT_fopenmp_cuda_parallel_target_regions); - // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options Opts.OpenMPCUDAForceFullRuntime = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp --- a/clang/test/OpenMP/assumes_include_nvptx.cpp +++ b/clang/test/OpenMP/assumes_include_nvptx.cpp @@ -19,8 +19,6 @@ // CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]] // CHECK: declare void @__kmpc_kernel_init(i32, i16) // CHECK-NOT: # -// CHECK: declare void @__kmpc_data_sharing_init_stack() -// CHECK-NOT: # // CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]] // CHECK: declare void @__kmpc_kernel_deinit(i16) // CHECK-NOT: # diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+_(.*)_l[0-9]+" // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s // expected-no-diagnostics @@ -9,8 +10,6 @@ return foo(a); } -// CHECK: define weak void @__omp_offloading_{{.*}}maini1{{.*}}_l[[@LINE+5]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}}) -// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack int maini1() { int a; @@ -23,36 +22,66 @@ } // parallel region -// CHECK: define {{.*}}void @{{.*}}(i32* noalias {{.*}}, i32* noalias {{.*}}, i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}}) -// CHECK-NOT: call i8* @__kmpc_data_sharing_coalesced_push_stack( -// CHECK: [[B_ADDR:%.+]] = alloca i32, -// CHECK: call {{.*}}[[FOO:@.*foo.*]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[B_ADDR]]) -// CHECK: call {{.*}}[[BAR:@.*bar.*]]() -// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( -// CHECK: ret void -// CHECK: define {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack - -// CHECK: define {{.*}}[[BAR]]() -// CHECK: alloca i32, -// CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32, -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: br label -// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) -// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]* -// CHECK: br label -// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ] -// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]] -// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]] -// CHECK: call {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}} [[A_ADDR]]) -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* -// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) -// CHECK: br label -// CHECK: ret i32 +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z6maini1v_l[0-9]+}} +// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK: .execute: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK: .omp.deinit: +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK-NEXT: br label [[DOTEXIT:%.*]] +// CHECK: .exit: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]] +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z3fooRi +// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-NEXT: ret i32 [[TMP1]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z3barv +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32* +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]] +// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]]) +// CHECK-NEXT: ret i32 [[CALL]] +// diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -3,8 +3,8 @@ ///==========================================================================/// // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 + // expected-no-diagnostics @@ -78,8 +78,6 @@ // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] // CHECK1: .exit: // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] { // CHECK1-NEXT: entry: @@ -145,8 +143,6 @@ // CHECK1-NEXT: br label [[DOTEXIT]] // CHECK1: .exit: // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: @@ -159,8 +155,6 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK1-NEXT: store i32 1000, i32* [[TMP0]], align 4 // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -178,8 +172,6 @@ // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: @@ -200,8 +192,6 @@ // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000 // CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -222,8 +212,6 @@ // CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 // CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] // CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -271,8 +259,6 @@ // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15 // CHECK2-SAME: () #[[ATTR1:[0-9]+]] { // CHECK2-NEXT: entry: @@ -333,8 +319,6 @@ // CHECK2-NEXT: br label [[DOTEXIT]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: @@ -347,8 +331,6 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK2-NEXT: store i32 1000, i32* [[TMP0]], align 4 // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -366,8 +348,6 @@ // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: @@ -388,8 +368,6 @@ // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000 // CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -410,4 +388,190 @@ // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 // CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] // CHECK2-NEXT: ret void +// CK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker +// CK1-SAME: () #[[ATTR0:[0-9]+]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CK1: .await.work: +// CK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CK1: .select.workers: +// CK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CK1: .execute.parallel: +// CK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CK1: .execute.fn: +// CK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CK1: .check.next: +// CK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CK1: .execute.fn2: +// CK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CK1: .check.next3: +// CK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CK1-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]]) +// CK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CK1: .terminate.parallel: +// CK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CK1: .barrier.parallel: +// CK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CK1-NEXT: br label [[DOTAWAIT_WORK]] +// CK1: .exit: +// CK1-NEXT: ret void +// +// +// CK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15 +// CK1-SAME: () #[[ATTR1:[0-9]+]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8 +// CK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CK1: .worker: +// CK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker() #[[ATTR3]] +// CK1-NEXT: br label [[DOTEXIT:%.*]] +// CK1: .mastercheck: +// CK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CK1: .master: +// CK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32* +// CK1-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CK1-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32* +// CK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CK1-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4 +// CK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CK1-NEXT: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CK1-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP8]], i64 1) +// CK1-NEXT: store i32 100, i32* [[B_ON_STACK]], align 4 +// CK1-NEXT: store i32 1000, i32* [[C]], align 4 +// CK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 0 +// CK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8* +// CK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1 +// CK1-NEXT: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP13]], i64 2) +// CK1-NEXT: call void @__kmpc_free_shared(i8* [[B]]) +// CK1-NEXT: call void @__kmpc_free_shared(i8* [[A]]) +// CK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CK1: .termination.notifier: +// CK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CK1-NEXT: br label [[DOTEXIT]] +// CK1: .exit: +// CK1-NEXT: ret void +// +// +// CK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CK1-NEXT: store i32 1000, i32* [[TMP0]], align 4 +// CK1-NEXT: ret void +// +// +// CK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CK1-NEXT: ret void +// +// +// CK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[C1:%.*]] = alloca i32*, align 8 +// CK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CK1-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8 +// CK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8 +// CK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CK1-NEXT: store i32* [[C]], i32** [[C1]], align 8 +// CK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000 +// CK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 +// CK1-NEXT: ret void +// +// +// CK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CK1-NEXT: entry: +// CK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CK1-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -2,21 +2,15 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK7 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK8 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK9 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK10 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK12 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER @@ -36,1999 +30,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i64 40, i1 false) -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = zext i32 [[TMP26]] to i64 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP25]] to i8* -// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP27]] to i8* -// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[CONV]] to i8* -// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK1-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK1-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 8 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 -// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK1-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 8 -// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 -// CHECK1-NEXT: [[TMP41:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK1-NEXT: store i8* [[TMP41]], i8** [[TMP40]], align 8 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK1-NEXT: [[TMP43:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 -// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP45]], i32 [[TMP43]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP46]], i64 7) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP51]], [[TMP52]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]] -// CHECK1-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK1: cond.true12: -// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: br label [[COND_END14:%.*]] -// CHECK1: cond.false13: -// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END14]] -// CHECK1: cond.end14: -// CHECK1-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP55]], [[COND_TRUE12]] ], [ [[TMP56]], [[COND_FALSE13]] ] -// CHECK1-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP57]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP58:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP59:%.*]] = load i32, i32* [[TMP58]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP59]]) -// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP61:%.*]] = icmp ne i32 [[TMP60]], 0 -// CHECK1-NEXT: br i1 [[TMP61]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 40, i1 false) -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP64]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK1-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 -// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 -// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) -// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] -// CHECK1-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] -// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] -// CHECK1-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] -// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] -// CHECK1-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK1-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* -// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 40, i16 1) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 40, i1 false) -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64 -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[CONV]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK2-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 -// CHECK2-NEXT: [[TMP38:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK2-NEXT: store i8* [[TMP38]], i8** [[TMP37]], align 8 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK2-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK2-NEXT: [[TMP40:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 -// CHECK2-NEXT: [[TMP43:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP42]], i32 [[TMP40]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP43]], i64 7) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -// CHECK2-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK2: cond.true12: -// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: br label [[COND_END14:%.*]] -// CHECK2: cond.false13: -// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END14]] -// CHECK2: cond.end14: -// CHECK2-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP52]], [[COND_TRUE12]] ], [ [[TMP53]], [[COND_FALSE13]] ] -// CHECK2-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP54]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP56]]) -// CHECK2-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP58:%.*]] = icmp ne i32 [[TMP57]], 0 -// CHECK2-NEXT: br i1 [[TMP58]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP59:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK2-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 40, i1 false) -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK2-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 -// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 -// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] -// CHECK2-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] -// CHECK2-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] -// CHECK2-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] -// CHECK2-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] -// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] -// CHECK2-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK2-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK2-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* -// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK3-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK3-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK3-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK3-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK3-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK3-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK3-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -// CHECK3-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK3: cond.true12: -// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END14:%.*]] -// CHECK3: cond.false13: -// CHECK3-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END14]] -// CHECK3: cond.end14: -// CHECK3-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] -// CHECK3-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 -// CHECK3-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK3-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK3-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK3-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK3-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK3-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK3-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK3-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK3-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK3-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* -// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK4-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK4-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK4-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK4-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK4-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -// CHECK4-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK4: cond.true12: -// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END14:%.*]] -// CHECK4: cond.false13: -// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END14]] -// CHECK4: cond.end14: -// CHECK4-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] -// CHECK4-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) -// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 -// CHECK4-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK4-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK4-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK4-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK4-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK4-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK5-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK5-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK5-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK5: cond.true12: -// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END14:%.*]] -// CHECK5: cond.false13: -// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END14]] -// CHECK5: cond.end14: -// CHECK5-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK5-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK5-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK5-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK5-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK5-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK6-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK6-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK6-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK6-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK6: cond.true12: -// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END14:%.*]] -// CHECK6: cond.false13: -// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END14]] -// CHECK6: cond.end14: -// CHECK6-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK6-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK6-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK6-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK6-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK6-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -2068,8 +69,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -2240,8 +239,6 @@ // CHECK7-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP64]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -2369,1655 +366,3595 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK8-NEXT: entry: // CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 // CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 40, i16 1) +// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK8-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 40, i1 false) +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64 +// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP24]] to i8* +// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK8-NEXT: [[TMP30:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK8-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK8-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK8-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK8-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK8-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK8-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK8-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK8-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK8-NEXT: [[TMP37:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK8-NEXT: [[TMP38:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK8-NEXT: store i8* [[TMP38]], i8** [[TMP37]], align 8 +// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK8-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP39]], 0 +// CHECK8-NEXT: [[TMP40:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK8-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 +// CHECK8-NEXT: [[TMP43:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP42]], i32 [[TMP40]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP43]], i64 7) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] +// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] +// CHECK8-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK8: cond.true12: +// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: br label [[COND_END14:%.*]] +// CHECK8: cond.false13: +// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END14]] +// CHECK8: cond.end14: +// CHECK8-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP52]], [[COND_TRUE12]] ], [ [[TMP53]], [[COND_FALSE13]] ] +// CHECK8-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP54]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP56]]) +// CHECK8-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP58:%.*]] = icmp ne i32 [[TMP57]], 0 +// CHECK8-NEXT: br i1 [[TMP58]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP59:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK8-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 40, i1 false) +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 // CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK8-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 // CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 // CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: +// CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK8-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK8-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK8-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK8-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK8-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK8-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK8-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK8-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK8-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK8-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK8-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK8-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK8-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK8-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK8-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK8-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK8-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: // CHECK8-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK9-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 +// CHECK9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* +// CHECK9-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK9-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK9-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* +// CHECK9-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK9-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK9-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 +// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK9-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK9-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 +// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK9-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK9-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 +// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 +// CHECK9-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK9-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK9-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] +// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] +// CHECK9-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK9: cond.true12: +// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END14:%.*]] +// CHECK9: cond.false13: +// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END14]] +// CHECK9: cond.end14: +// CHECK9-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] +// CHECK9-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) +// CHECK9-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 +// CHECK9-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK9-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK9-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK9-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK9-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK9-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK9-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK9-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK9-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK9-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK9-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK9-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK9-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK9-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK9-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK10-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 +// CHECK10-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* +// CHECK10-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +// CHECK10-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] +// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK10-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK10-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* +// CHECK10-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK10-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK10-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK10-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK10-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK10-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 +// CHECK10-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK10-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK10-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 +// CHECK10-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK10-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 +// CHECK10-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK10-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK10-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 +// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK10-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 +// CHECK10-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK10-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK10-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] +// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] +// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] +// CHECK10-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK10: cond.true12: +// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: br label [[COND_END14:%.*]] +// CHECK10: cond.false13: +// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END14]] +// CHECK10: cond.end14: +// CHECK10-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] +// CHECK10-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) +// CHECK10-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 +// CHECK10-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK10-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK10-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK10-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK10-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK10-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK10-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK10-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK10-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK10-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK10-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK10-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK10-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK10-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK10-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) +// CHECK11-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK11-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK11-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK11-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK11-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK11: cond.true12: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END14:%.*]] +// CHECK11: cond.false13: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END14]] +// CHECK11: cond.end14: +// CHECK11-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK11-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK11-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK11-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK11-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK11-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK11-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK11-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK11-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) +// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK12-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK12-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK12-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK12-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK12-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK12-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK12: cond.true12: +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END14:%.*]] +// CHECK12: cond.false13: +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END14]] +// CHECK12: cond.end14: +// CHECK12-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK12-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK12-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK12-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK12-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK12-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK12-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK12-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK12-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK12-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK12-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK12-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 40, i16 1) -// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK8-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i64 40, i1 false) -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64 -// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP30:%.*]] = bitcast i32* [[CONV]] to i8* -// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK8-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK8-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK8-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK8-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK8-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 -// CHECK8-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK8-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK8-NEXT: [[TMP37:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 -// CHECK8-NEXT: [[TMP38:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK8-NEXT: store i8* [[TMP38]], i8** [[TMP37]], align 8 -// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK8-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK8-NEXT: [[TMP40:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK8-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 -// CHECK8-NEXT: [[TMP43:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP42]], i32 [[TMP40]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP43]], i64 7) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] -// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]] -// CHECK8-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK8: cond.true12: -// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: br label [[COND_END14:%.*]] -// CHECK8: cond.false13: -// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END14]] -// CHECK8: cond.end14: -// CHECK8-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP52]], [[COND_TRUE12]] ], [ [[TMP53]], [[COND_FALSE13]] ] -// CHECK8-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP54]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP56]]) -// CHECK8-NEXT: [[TMP57:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP58:%.*]] = icmp ne i32 [[TMP57]], 0 -// CHECK8-NEXT: br i1 [[TMP58]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK8: .omp.lastprivate.then: -// CHECK8-NEXT: [[TMP59:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK8-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 40, i1 false) -// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK8: .omp.lastprivate.done: -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK8-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40) +// CHECK1-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK1-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i64 7) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK1: cond.true12: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: br label [[COND_END14:%.*]] +// CHECK1: cond.false13: +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END14]] +// CHECK1: cond.end14: +// CHECK1-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK1-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK1-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK1-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 40, i1 false) +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK1-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK8-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 -// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 -// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK8-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) -// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] -// CHECK8-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK8-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] -// CHECK8-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK8-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK8-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] -// CHECK8-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK8-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 -// CHECK8-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] -// CHECK8-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] -// CHECK8-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] -// CHECK8-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK8-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK8-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK8: .omp.lastprivate.then: -// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK8-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* -// CHECK8-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) -// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK8: .omp.lastprivate.done: -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK1-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK1-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK1-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK9: .execute: -// CHECK9-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK9: .omp.deinit: -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK9-NEXT: br label [[DOTEXIT:%.*]] -// CHECK9: .exit: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK9-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -// CHECK9-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK9-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK9-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK9-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK9-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK9-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* -// CHECK9-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK9-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK9-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK9-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 -// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK9-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK9-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 -// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK9-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 -// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK9-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK9-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 -// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK9-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK9-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK9-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] -// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -// CHECK9-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK9: cond.true12: -// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: br label [[COND_END14:%.*]] -// CHECK9: cond.false13: -// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END14]] -// CHECK9: cond.end14: -// CHECK9-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] -// CHECK9-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) -// CHECK9-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 -// CHECK9-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK9: .omp.lastprivate.then: -// CHECK9-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK9-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) -// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK2-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK2-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK2: cond.true12: +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END14:%.*]] +// CHECK2: cond.false13: +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END14]] +// CHECK2: cond.end14: +// CHECK2-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK2-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK2-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK9-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK9-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK9-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK9-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK9-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK9-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK9-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK9-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK9-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK9-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK9-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK9-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK9: omp.body.continue: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK9-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK9-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK9: .omp.lastprivate.then: -// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK9-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK9-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK2-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK2-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK2-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK2-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK2-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK2-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK10: .execute: -// CHECK10-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK10: .omp.deinit: -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK10-NEXT: br label [[DOTEXIT:%.*]] -// CHECK10: .exit: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK10-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -// CHECK10-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK10-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK10-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK10-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK10-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK10-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK10-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK10-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* -// CHECK10-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK10-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK10-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK10-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK10-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK10-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK10-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 -// CHECK10-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK10-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK10-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 -// CHECK10-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK10-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 -// CHECK10-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK10-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK10-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 -// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK10-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK10-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK10-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK10-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] -// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -// CHECK10-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK10: cond.true12: -// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: br label [[COND_END14:%.*]] -// CHECK10: cond.false13: -// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END14]] -// CHECK10: cond.end14: -// CHECK10-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] -// CHECK10-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) -// CHECK10-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 -// CHECK10-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK10: .omp.lastprivate.then: -// CHECK10-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK10-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) -// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK10: .omp.lastprivate.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK3-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK3-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK3: cond.true12: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END14:%.*]] +// CHECK3: cond.false13: +// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END14]] +// CHECK3: cond.end14: +// CHECK3-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK3-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK3-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK10-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK10-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK10-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK10-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK10-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK10-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK10-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK10-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK10-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK10-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK10-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK10-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK10: omp.body.continue: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK10-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK10-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK10: .omp.lastprivate.then: -// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK10-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK10-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK10: .omp.lastprivate.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK3-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK3-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK3-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK3-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK3-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK3-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK11-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK11-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK11-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK11-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK11-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK11-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK11-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK11-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK11: cond.true12: -// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END14:%.*]] -// CHECK11: cond.false13: -// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END14]] -// CHECK11: cond.end14: -// CHECK11-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK11-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK11-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK11-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK11-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40) +// CHECK4-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK4-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK4-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK4-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK4-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK4-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i64 7) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK4-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK4: cond.true12: +// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END14:%.*]] +// CHECK4: cond.false13: +// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END14]] +// CHECK4: cond.end14: +// CHECK4-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK4-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK4-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK4-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK4-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 40, i1 false) +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK11-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK11-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK11-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK11-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK11-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK11-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK11-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK11-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK4-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 +// CHECK4-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK4-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK4-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK4-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK5-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK5-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK5-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK5: cond.true12: +// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END14:%.*]] +// CHECK5: cond.false13: +// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END14]] +// CHECK5: cond.end14: +// CHECK5-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK5-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK5-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK5-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK5-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 +// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK12-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK12-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK12-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK12-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK12-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK12-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK12-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK12: cond.true12: -// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END14:%.*]] -// CHECK12: cond.false13: -// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END14]] -// CHECK12: cond.end14: -// CHECK12-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK12-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK12-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK12-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK12-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK6-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK6-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK6-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK6: cond.true12: +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END14:%.*]] +// CHECK6: cond.false13: +// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END14]] +// CHECK6: cond.end14: +// CHECK6-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK6-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK6-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK6-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK12-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK12-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK12-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK12-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK12-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK12-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK12-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK12-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK6-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp --- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp +++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp @@ -806,9 +806,9 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 // CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14:![0-9]+]] // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK2-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK2-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -816,9 +816,9 @@ // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6:[0-9]+]] // CHECK2-NEXT: br label [[DOTEXIT:%.*]] // CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK2-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK2-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1 @@ -826,11 +826,10 @@ // CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK2-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP7:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast %class.anon* [[L7]] to i8* // CHECK2-NEXT: [[TMP9:%.*]] = bitcast %class.anon* [[TMP7]] to i8* @@ -875,9 +874,8 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP1:%.*]] = load %class.anon*, %class.anon** [[L_ADDR]], align 8 // CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -995,9 +993,9 @@ // CHECK2-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8 // CHECK2-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8 // CHECK2-NEXT: store %class.anon.0* [[TMP3]], %class.anon.0** [[_TMP2]], align 8 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK2-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -1005,9 +1003,9 @@ // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6]] // CHECK2-NEXT: br label [[DOTEXIT:%.*]] // CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK2-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 // CHECK2-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 // CHECK2-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1 @@ -1015,11 +1013,10 @@ // CHECK2-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] // CHECK2-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP9:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 // CHECK2-NEXT: [[TMP10:%.*]] = bitcast %class.anon.0* [[L9]] to i8* // CHECK2-NEXT: [[TMP11:%.*]] = bitcast %class.anon.0* [[TMP9]] to i8* @@ -1084,9 +1081,8 @@ // CHECK2-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 // CHECK2-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 // CHECK2-NEXT: store %class.anon.0* [[TMP4]], %class.anon.0** [[_TMP2]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1204,9 +1200,8 @@ // CHECK2-NEXT: store %class.anon* [[T]], %class.anon** [[T_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load %class.anon*, %class.anon** [[T_ADDR]], align 8 // CHECK2-NEXT: store %class.anon* [[TMP0]], %class.anon** [[TMP]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1316,9 +1311,9 @@ // CHECK3-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8 // CHECK3-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8 // CHECK3-NEXT: store %class.anon* [[TMP3]], %class.anon** [[_TMP2]], align 8 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14:![0-9]+]] // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK3-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -1326,9 +1321,9 @@ // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6:[0-9]+]] // CHECK3-NEXT: br label [[DOTEXIT:%.*]] // CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK3-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 // CHECK3-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 // CHECK3-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1 @@ -1336,11 +1331,10 @@ // CHECK3-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] // CHECK3-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8* // CHECK3-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8* @@ -1405,9 +1399,8 @@ // CHECK3-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 // CHECK3-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 // CHECK3-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -1564,9 +1557,9 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 // CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK3-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK3-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -1574,9 +1567,9 @@ // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6]] // CHECK3-NEXT: br label [[DOTEXIT:%.*]] // CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK3-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK3-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1 @@ -1584,11 +1577,10 @@ // CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK3-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8* // CHECK3-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8* @@ -1633,9 +1625,8 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 // CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1695,9 +1686,8 @@ // CHECK3-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 // CHECK3-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1807,9 +1797,9 @@ // CHECK4-NEXT: store i32* [[TMP0]], i32** [[TMP]], align 8 // CHECK4-NEXT: store i32* [[TMP1]], i32** [[_TMP1]], align 8 // CHECK4-NEXT: store %class.anon* [[TMP3]], %class.anon** [[_TMP2]], align 8 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12:![0-9]+]] +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13:![0-9]+]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14:![0-9]+]] // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK4-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -1817,9 +1807,9 @@ // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_worker() #[[ATTR6:[0-9]+]] // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK4-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 // CHECK4-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 // CHECK4-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1 @@ -1827,11 +1817,10 @@ // CHECK4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] // CHECK4-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK4-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK4-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 // CHECK4-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8* // CHECK4-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8* @@ -1896,9 +1885,8 @@ // CHECK4-NEXT: store i32* [[TMP1]], i32** [[TMP]], align 8 // CHECK4-NEXT: store i32* [[TMP2]], i32** [[_TMP1]], align 8 // CHECK4-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -2055,9 +2043,9 @@ // CHECK4-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 // CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK4-NEXT: [[TMP2:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK4-NEXT: br i1 [[TMP2]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -2065,9 +2053,9 @@ // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_worker() #[[ATTR6]] // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG12]] +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK4-NEXT: [[TMP4:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK4-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], -1 @@ -2075,11 +2063,10 @@ // CHECK4-NEXT: [[TMP6:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK4-NEXT: br i1 [[TMP6]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG14]] // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK4-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8* // CHECK4-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8* @@ -2124,9 +2111,8 @@ // CHECK4-NEXT: [[TMP0:%.*]] = load %struct.S*, %struct.S** [[THIS_ADDR]], align 8 // CHECK4-NEXT: [[TMP1:%.*]] = load %class.anon.0*, %class.anon.0** [[L_ADDR]], align 8 // CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -2186,9 +2172,8 @@ // CHECK4-NEXT: store %class.anon.0* [[T]], %class.anon.0** [[T_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load %class.anon.0*, %class.anon.0** [[T_ADDR]], align 8 // CHECK4-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG13]] // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) diff --git a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp @@ -31,7 +31,6 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -139,7 +138,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK1: .termination.notifier: @@ -156,7 +154,6 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -264,7 +261,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK2: .termination.notifier: @@ -281,7 +277,6 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -389,7 +384,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK3: .termination.notifier: diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -113,7 +113,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) @@ -319,7 +318,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) @@ -525,7 +523,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,12 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -76,989 +74,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK1: .execute.fn2: -// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .check.next3: -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK1: .execute.fn5: -// CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .check.next6: -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 -// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 -// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 -// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK1-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i64 1) -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK1-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK1: omp.critical.loop: -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK1: omp.critical.test: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK1: omp.critical.body: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK1: omp.critical.sync: -// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK1: omp.critical.exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK2: .execute.fn2: -// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .check.next3: -// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK2: .execute.fn5: -// CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .check.next6: -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK2-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 -// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 -// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 -// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i64 1) -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK2: omp.critical.loop: -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK2: omp.critical.test: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK2: omp.critical.body: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK2: omp.critical.sync: -// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK2: omp.critical.exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -1113,8 +128,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1165,8 +178,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1177,8 +188,6 @@ // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK4-NEXT: store i32 42, i32* [[A]], align 4 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1192,8 +201,6 @@ // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1204,8 +211,6 @@ // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK4-NEXT: store i32 43, i32* [[A]], align 4 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1219,8 +224,6 @@ // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1231,8 +234,6 @@ // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK4-NEXT: store i32 44, i32* [[A]], align 4 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1246,8 +247,6 @@ // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker // CHECK4-SAME: () #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1288,8 +287,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 // CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1354,8 +351,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1369,8 +364,6 @@ // CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 // CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1384,8 +377,6 @@ // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker // CHECK4-SAME: () #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1426,8 +417,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1486,8 +475,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1528,8 +515,6 @@ // CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]] // CHECK4: omp.critical.exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1547,8 +532,6 @@ // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 // CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] // CHECK4-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1570,30 +553,189 @@ // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK5: .execute.fn2: +// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .check.next3: +// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK5: .execute.fn5: +// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .check.next6: +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 +// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK5-SAME: () #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) // CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] // CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] // CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK5: .execute.fn2: -// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .check.next3: -// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK5: .execute.fn5: -// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .check.next6: -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] // CHECK5: .terminate.parallel: // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() @@ -1603,51 +745,63 @@ // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] // CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .mastercheck: // CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK5: .master: // CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 // CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK5: .termination.notifier: // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -1655,9 +809,7 @@ // CHECK5-NEXT: br label [[DOTEXIT]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 @@ -1665,11 +817,12 @@ // CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK5-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) // CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 @@ -1680,23 +833,142 @@ // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK5-SAME: () #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: // CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK5: omp.critical.loop: +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK5: omp.critical.test: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK5: omp.critical.body: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK5: omp.critical.sync: +// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK5: omp.critical.exit: // CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 @@ -1707,814 +979,1908 @@ // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] // CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK6: .execute.fn2: +// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .check.next3: +// CHECK6-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK6: .execute.fn5: +// CHECK6-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .check.next6: +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 +// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK6-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK6-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK6-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK6-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK6: omp.critical.loop: +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK6-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK6: omp.critical.test: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK6: omp.critical.body: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK6-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK6-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK6: omp.critical.sync: +// CHECK6-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK6-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK6: omp.critical.exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK1: .execute.fn2: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next3: +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK1: .execute.fn5: +// CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .check.next6: +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27 +// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 -// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 -// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 -// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 +// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 +// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK5: omp.critical.loop: -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK5: omp.critical.test: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK5: omp.critical.body: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK5: omp.critical.sync: -// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK5: omp.critical.exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker -// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK6: .execute.fn: -// CHECK6-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .check.next: -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK6: .execute.fn2: -// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .check.next3: -// CHECK6-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK6: .execute.fn5: -// CHECK6-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .check.next6: -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56 +// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i64 1) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK1: omp.critical.loop: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK1: omp.critical.test: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK1: omp.critical.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK1: omp.critical.sync: +// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK1: omp.critical.exit: +// CHECK1-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 -// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK2: .execute.fn2: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next3: +// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK2: .execute.fn5: +// CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next6: +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27 +// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK6-SAME: () #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK6: .execute.fn: -// CHECK6-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .check.next: -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK6-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK6-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK6-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 -// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 -// CHECK6-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 -// CHECK6-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK6-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK2-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK2-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK6-SAME: () #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK6: .execute.fn: -// CHECK6-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .check.next: -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56 +// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK6-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK6-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK2: omp.critical.loop: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK2: omp.critical.test: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK2: omp.critical.body: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK2: omp.critical.sync: +// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK2: omp.critical.exit: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK6: omp.critical.loop: -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK6-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK6: omp.critical.test: -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK6-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK6: omp.critical.body: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK6-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK6-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK6-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK6: omp.critical.sync: -// CHECK6-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK6-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK6-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK6: omp.critical.exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK3: .execute.fn2: +// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .check.next3: +// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK3: .execute.fn5: +// CHECK3-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .check.next6: +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27 +// CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l27_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK3-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK3-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK3-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK3-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56 +// CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l56_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK3: omp.critical.loop: +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK3-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK3: omp.critical.test: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK3: omp.critical.body: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK3: omp.critical.sync: +// CHECK3-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK3: omp.critical.exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -2,7 +2,6 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -33,228 +32,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 -// CHECK1-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP7]], i16 [[TMP6]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP10]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[D]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast i32* [[D]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP17]], i64 2) -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP19]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]] -// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]** -// CHECK1-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -295,8 +72,6 @@ // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 // CHECK2-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK2-NEXT: entry: @@ -358,8 +133,6 @@ // CHECK2-NEXT: br label [[DOTEXIT]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: @@ -448,8 +221,6 @@ // CHECK2: omp.dispatch.end: // CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -470,4 +241,217 @@ // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] // CHECK2-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13 +// CHECK1-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32* +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP7]], i32* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i32* [[D_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP12]], i64 2) +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]] +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]** +// CHECK1-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -155,7 +155,6 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -257,7 +256,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK1: .termination.notifier: // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -332,7 +330,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8 // CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 @@ -441,7 +438,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -577,7 +573,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -681,7 +676,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double // CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00 @@ -711,56 +705,27 @@ // // // CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK1-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK1: .spmd: -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .non-spmd: -// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128 -// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0) -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK1-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK1-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i64 2) -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK1: .non-spmd4: -// CHECK1-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK1-NEXT: br label [[DOTEXIT5]] -// CHECK1: .exit5: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK1-NEXT: ret i32 [[TMP20]] +// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i64 2) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK1-NEXT: ret i32 [[TMP7]] // // // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 @@ -870,7 +835,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK1-NEXT: unreachable // CHECK1: 5: @@ -954,7 +918,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -987,7 +950,6 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -1089,7 +1051,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK2: .termination.notifier: // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -1164,7 +1125,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 @@ -1272,7 +1232,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1407,7 +1366,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1510,7 +1468,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double // CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 @@ -1540,56 +1497,27 @@ // // // CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK2-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// CHECK2-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK2-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK2: .spmd: -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .non-spmd: -// CHECK2-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 -// CHECK2-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK2-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK2-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK2: .non-spmd4: -// CHECK2-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK2-NEXT: br label [[DOTEXIT5]] -// CHECK2: .exit5: -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK2-NEXT: ret i32 [[TMP20]] +// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK2-NEXT: ret i32 [[TMP7]] // // // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 @@ -1699,7 +1627,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK2-NEXT: unreachable // CHECK2: 5: @@ -1782,7 +1709,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1815,7 +1741,6 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -1917,7 +1842,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK3: .termination.notifier: // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -1992,7 +1916,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 @@ -2100,7 +2023,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -2235,7 +2157,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -2338,7 +2259,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double // CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 @@ -2368,56 +2288,27 @@ // // // CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK3-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// CHECK3-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK3-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK3: .spmd: -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .non-spmd: -// CHECK3-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 -// CHECK3-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK3-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK3-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK3: .non-spmd4: -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK3-NEXT: br label [[DOTEXIT5]] -// CHECK3: .exit5: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK3-NEXT: ret i32 [[TMP20]] +// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK3-NEXT: ret i32 [[TMP7]] // // // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 @@ -2527,7 +2418,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK3-NEXT: unreachable // CHECK3: 5: @@ -2610,7 +2500,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -60,7 +60,6 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -110,7 +109,6 @@ // CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -173,7 +171,6 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -223,7 +220,6 @@ // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -286,7 +282,6 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -336,7 +331,6 @@ // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -399,7 +393,6 @@ // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -449,7 +442,6 @@ // CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -512,7 +504,6 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK5: .execute: // CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -562,7 +553,6 @@ // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK5: .execute: // CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -625,7 +615,6 @@ // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK6: .execute: // CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -675,7 +664,6 @@ // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK6: .execute: // CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -55,7 +55,6 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -108,7 +107,6 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -172,7 +170,6 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -224,7 +221,6 @@ // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -288,7 +284,6 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -340,7 +335,6 @@ // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -404,7 +398,6 @@ // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -457,7 +450,6 @@ // CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -521,7 +513,6 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK5: .execute: // CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -573,7 +564,6 @@ // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK5: .execute: // CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -637,7 +627,6 @@ // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK6: .execute: // CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -689,7 +678,6 @@ // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK6: .execute: // CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -55,7 +55,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -73,7 +72,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -91,7 +89,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -55,7 +55,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -239,7 +238,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -501,7 +499,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -94,9 +94,9 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i8* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -104,9 +104,9 @@ // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] // CHECK1-NEXT: br label [[DOTEXIT:%.*]] // CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -114,11 +114,10 @@ // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 8 // CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[A_CASTED]] to i8* @@ -194,9 +193,9 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -204,9 +203,9 @@ // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]] // CHECK1-NEXT: br label [[DOTEXIT:%.*]] // CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -214,11 +213,10 @@ // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8 // CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[AA_CASTED]] to i16* @@ -259,9 +257,8 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -380,9 +377,9 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -390,9 +387,9 @@ // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] // CHECK2-NEXT: br label [[DOTEXIT:%.*]] // CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -400,11 +397,10 @@ // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4 // CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8* @@ -480,9 +476,9 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -490,9 +486,9 @@ // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]] // CHECK2-NEXT: br label [[DOTEXIT:%.*]] // CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -500,11 +496,10 @@ // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16* @@ -545,9 +540,8 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -666,9 +660,9 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8* -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -676,9 +670,9 @@ // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] // CHECK3-NEXT: br label [[DOTEXIT:%.*]] // CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -686,11 +680,10 @@ // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4 // CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8* @@ -766,9 +759,9 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] @@ -776,9 +769,9 @@ // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l28_worker() #[[ATTR3]] // CHECK3-NEXT: br label [[DOTEXIT:%.*]] // CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 @@ -786,11 +779,10 @@ // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16* @@ -831,9 +823,8 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -2,12 +2,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -35,583 +32,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i64 1) -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i64 1) -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK3: .execute.fn: -// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .check.next: -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK3-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -652,8 +72,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 // CHECK4-SAME: () #[[ATTR1:[0-9]+]] { // CHECK4-NEXT: entry: @@ -695,8 +113,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -769,417 +185,952 @@ // CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) // CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK5-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK5-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK6-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6:![0-9]+]] +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i64 1) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK5-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK5-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6:![0-9]+]] +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i32 1) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK6: .execute.fn: -// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .check.next: -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK6-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK3-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6:![0-9]+]] +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG4]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG5]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG6]] +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK6-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i32 1) +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,26 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK2 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK7 - -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK8 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK9 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK10 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK12 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK13 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK14 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK15 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 // expected-no-diagnostics #ifndef HEADER @@ -124,8 +109,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -285,8 +268,6 @@ // CHECK-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -420,8 +401,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -452,8 +431,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -586,8 +563,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -681,8 +656,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -705,8 +678,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -800,8 +771,6 @@ // CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -870,8 +839,6 @@ // CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -902,8 +869,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1011,8 +976,6 @@ // CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1105,8 +1068,6 @@ // CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1137,8 +1098,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1287,8 +1246,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1424,8 +1381,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1459,8 +1414,6 @@ // CHECK-NEXT: br label [[DOTEXIT:%.*]] // CHECK: .exit: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1599,8 +1552,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -1697,12638 +1648,6 @@ // CHECK-NEXT: br label [[OMP_PRECOND_END]] // CHECK: omp.precond.end: // CHECK-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 -// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* -// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* -// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] -// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK1: cond.true14: -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END16:%.*]] -// CHECK1: cond.false15: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END16]] -// CHECK1: cond.end16: -// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] -// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) -// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 -// CHECK1-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK1: cond.true11: -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END13:%.*]] -// CHECK1: cond.false12: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END13]] -// CHECK1: cond.end13: -// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK1: cond.true5: -// CHECK1-NEXT: br label [[COND_END7:%.*]] -// CHECK1: cond.false6: -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END7]] -// CHECK1: cond.end7: -// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK1: cond.true7: -// CHECK1-NEXT: br label [[COND_END9:%.*]] -// CHECK1: cond.false8: -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END9]] -// CHECK1: cond.end9: -// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK1-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK1-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: land.lhs.true: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK1-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]] -// CHECK1-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK1-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK1-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]] -// CHECK1-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]] -// CHECK1: cond.true20: -// CHECK1-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: br label [[COND_END22:%.*]] -// CHECK1: cond.false21: -// CHECK1-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: br label [[COND_END22]] -// CHECK1: cond.end22: -// CHECK1-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] -// CHECK1-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK1-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK1-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: land.lhs.true: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 -// CHECK1-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 -// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK1-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 -// CHECK1-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] -// CHECK1-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 -// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] -// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] -// CHECK1-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK1-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 -// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] -// CHECK1-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 -// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK1-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 -// CHECK1-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] -// CHECK1-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 -// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] -// CHECK1-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] -// CHECK1-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 -// CHECK1-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] -// CHECK1-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 -// CHECK1-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK1-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] -// CHECK1-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK1-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK1-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK1-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK1: cond.true11: -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END13:%.*]] -// CHECK1: cond.false12: -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END13]] -// CHECK1: cond.end13: -// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK1-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK2-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK2-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* -// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] -// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK2: cond.true14: -// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END16:%.*]] -// CHECK2: cond.false15: -// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END16]] -// CHECK2: cond.end16: -// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] -// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) -// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK2-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK2: omp.dispatch.cond: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK2: omp.dispatch.body: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK2: omp.dispatch.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK2: omp.dispatch.end: -// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK2: cond.true11: -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END13:%.*]] -// CHECK2: cond.false12: -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END13]] -// CHECK2: cond.end13: -// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK2: cond.true5: -// CHECK2-NEXT: br label [[COND_END7:%.*]] -// CHECK2: cond.false6: -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END7]] -// CHECK2: cond.end7: -// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK2: cond.true7: -// CHECK2-NEXT: br label [[COND_END9:%.*]] -// CHECK2: cond.false8: -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END9]] -// CHECK2: cond.end9: -// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK2-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I8:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J9:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: land.lhs.true: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK2-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK2-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] -// CHECK2: cond.true17: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END19:%.*]] -// CHECK2: cond.false18: -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END19]] -// CHECK2: cond.end19: -// CHECK2-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] -// CHECK2-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: land.lhs.true: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 -// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK2-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] -// CHECK2-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] -// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] -// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK2-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 -// CHECK2-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] -// CHECK2-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 -// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] -// CHECK2-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] -// CHECK2-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] -// CHECK2-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 -// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] -// CHECK2-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK2-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK2-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK2-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] -// CHECK2-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] -// CHECK2-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK2: cond.true11: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END13:%.*]] -// CHECK2: cond.false12: -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END13]] -// CHECK2: cond.end13: -// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK3-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* -// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK3-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* -// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK3-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// CHECK3-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] -// CHECK3-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK3: cond.true14: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END16:%.*]] -// CHECK3: cond.false15: -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END16]] -// CHECK3: cond.end16: -// CHECK3-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] -// CHECK3-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) -// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 -// CHECK3-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK3: omp.dispatch.cond: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK3-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK3-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK3-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK3: omp.dispatch.body: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK3: omp.dispatch.inc: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK3: omp.dispatch.end: -// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK3: cond.true11: -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END13:%.*]] -// CHECK3: cond.false12: -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END13]] -// CHECK3: cond.end13: -// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK3-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK3-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK3: cond.true5: -// CHECK3-NEXT: br label [[COND_END7:%.*]] -// CHECK3: cond.false6: -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END7]] -// CHECK3: cond.end7: -// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK3-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK3: cond.true7: -// CHECK3-NEXT: br label [[COND_END9:%.*]] -// CHECK3: cond.false8: -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END9]] -// CHECK3: cond.end9: -// CHECK3-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK3-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK3-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK3-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK3-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK3-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK3-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I8:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J9:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: land.lhs.true: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK3-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] -// CHECK3: cond.true17: -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END19:%.*]] -// CHECK3: cond.false18: -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END19]] -// CHECK3: cond.end19: -// CHECK3-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] -// CHECK3-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: land.lhs.true: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 -// CHECK3-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK3-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK3-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] -// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK3-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] -// CHECK3-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] -// CHECK3-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK3-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 -// CHECK3-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] -// CHECK3-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK3-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 -// CHECK3-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] -// CHECK3-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] -// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] -// CHECK3-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 -// CHECK3-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] -// CHECK3-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK3-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK3-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK3-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] -// CHECK3-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] -// CHECK3-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK3: cond.true11: -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END13:%.*]] -// CHECK3: cond.false12: -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END13]] -// CHECK3: cond.end13: -// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK4-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK4: cond.true11: -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END13:%.*]] -// CHECK4: cond.false12: -// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END13]] -// CHECK4: cond.end13: -// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK4-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK4: omp.dispatch.cond: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK4: omp.dispatch.body: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK4: omp.dispatch.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK4: omp.dispatch.end: -// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK4: cond.true10: -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END12:%.*]] -// CHECK4: cond.false11: -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END12]] -// CHECK4: cond.end12: -// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK4: cond.true5: -// CHECK4-NEXT: br label [[COND_END7:%.*]] -// CHECK4: cond.false6: -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END7]] -// CHECK4: cond.end7: -// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK4: cond.true6: -// CHECK4-NEXT: br label [[COND_END8:%.*]] -// CHECK4: cond.false7: -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END8]] -// CHECK4: cond.end8: -// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK4-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK4-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: land.lhs.true: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK4-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK4-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK4-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK4: cond.true18: -// CHECK4-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: br label [[COND_END20:%.*]] -// CHECK4: cond.false19: -// CHECK4-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: br label [[COND_END20]] -// CHECK4: cond.end20: -// CHECK4-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK4-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK4-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK4-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: land.lhs.true: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK4-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK4-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK4-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK4-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK4-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK4-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK4-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK4-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK4-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK4-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK4-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK4-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK4-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK4-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK4-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK4-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK4-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK4-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK4-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK4-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK4-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK4-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK4-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK4-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK4-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK4-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK4-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK4-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK4-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK4: cond.true10: -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END12:%.*]] -// CHECK4: cond.false11: -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END12]] -// CHECK4: cond.end12: -// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK5-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK5-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK5-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK5: cond.true11: -// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END13:%.*]] -// CHECK5: cond.false12: -// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END13]] -// CHECK5: cond.end13: -// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK5-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK5-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK5: omp.dispatch.cond: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK5: omp.dispatch.body: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK5: omp.dispatch.inc: -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK5: omp.dispatch.end: -// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK5-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK5: cond.true10: -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END12:%.*]] -// CHECK5: cond.false11: -// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END12]] -// CHECK5: cond.end12: -// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK5-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK5-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK5: cond.true5: -// CHECK5-NEXT: br label [[COND_END7:%.*]] -// CHECK5: cond.false6: -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END7]] -// CHECK5: cond.end7: -// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK5-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK5: cond.true6: -// CHECK5-NEXT: br label [[COND_END8:%.*]] -// CHECK5: cond.false7: -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END8]] -// CHECK5: cond.end8: -// CHECK5-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK5-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK5-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK5-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK5-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK5-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK5-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK5-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK5-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK5-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: land.lhs.true: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK5-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK5-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK5-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK5-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK5-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK5-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK5-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK5: cond.true18: -// CHECK5-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: br label [[COND_END20:%.*]] -// CHECK5: cond.false19: -// CHECK5-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: br label [[COND_END20]] -// CHECK5: cond.end20: -// CHECK5-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK5-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK5-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK5-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK5-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK5-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK5-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: land.lhs.true: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK5-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK5-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK5-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK5-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK5-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK5-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK5-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK5-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK5-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK5-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK5-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK5-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK5-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK5-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK5-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK5-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK5-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK5-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK5-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK5-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK5-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK5-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK5-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK5-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK5-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK5-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK5-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK5-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK5-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK5-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK5-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK5: cond.true10: -// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END12:%.*]] -// CHECK5: cond.false11: -// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END12]] -// CHECK5: cond.end12: -// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK6-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK6-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK6: cond.true11: -// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END13:%.*]] -// CHECK6: cond.false12: -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END13]] -// CHECK6: cond.end13: -// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK6-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK6: omp.dispatch.cond: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK6: omp.dispatch.body: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK6: omp.dispatch.inc: -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK6: omp.dispatch.end: -// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK6-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK6: cond.true10: -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END12:%.*]] -// CHECK6: cond.false11: -// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END12]] -// CHECK6: cond.end12: -// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK6-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK6-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK6: cond.true5: -// CHECK6-NEXT: br label [[COND_END7:%.*]] -// CHECK6: cond.false6: -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END7]] -// CHECK6: cond.end7: -// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK6-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK6: cond.true6: -// CHECK6-NEXT: br label [[COND_END8:%.*]] -// CHECK6: cond.false7: -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END8]] -// CHECK6: cond.end8: -// CHECK6-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK6-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK6-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK6-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK6-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK6-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK6-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK6-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK6-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: land.lhs.true: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK6-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK6-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK6-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK6-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK6-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK6-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK6-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK6-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK6: cond.true18: -// CHECK6-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: br label [[COND_END20:%.*]] -// CHECK6: cond.false19: -// CHECK6-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: br label [[COND_END20]] -// CHECK6: cond.end20: -// CHECK6-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK6-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK6-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK6-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK6-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK6-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: land.lhs.true: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK6-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK6-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK6-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK6-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK6-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK6-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK6-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK6-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK6-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK6-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK6-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK6-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK6-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK6-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK6-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK6-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK6-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK6-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK6-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK6-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK6-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK6-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK6-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK6-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK6-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK6-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK6-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK6-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK6-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK6-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK6-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK6-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK6-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK6-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK6: cond.true10: -// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END12:%.*]] -// CHECK6: cond.false11: -// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END12]] -// CHECK6: cond.end12: -// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK7-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK7-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK7-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK7-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK7-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK7-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK7-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK7-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK7-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK7-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK7-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK7-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK7-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK7: cond.true11: -// CHECK7-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: br label [[COND_END13:%.*]] -// CHECK7: cond.false12: -// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END13]] -// CHECK7: cond.end13: -// CHECK7-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK7-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK7-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK7-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK7: .omp.lastprivate.then: -// CHECK7-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK7: .omp.lastprivate.done: -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK7-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK7: omp.dispatch.cond: -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK7: omp.dispatch.body: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK7-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK7-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK7: omp.dispatch.inc: -// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK7: omp.dispatch.end: -// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK7-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK7: .omp.lastprivate.then: -// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK7: .omp.lastprivate.done: -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK7-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK7-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK7-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK7-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK7-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK7: cond.true10: -// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: br label [[COND_END12:%.*]] -// CHECK7: cond.false11: -// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END12]] -// CHECK7: cond.end12: -// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK7-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK7-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK7-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK7-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK7-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK7-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK7: cond.true5: -// CHECK7-NEXT: br label [[COND_END7:%.*]] -// CHECK7: cond.false6: -// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END7]] -// CHECK7: cond.end7: -// CHECK7-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK7-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK7-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK7-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK7-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK7-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK7-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK7-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK7-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK7: cond.true6: -// CHECK7-NEXT: br label [[COND_END8:%.*]] -// CHECK7: cond.false7: -// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END8]] -// CHECK7: cond.end8: -// CHECK7-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK7-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK7-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK7-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK7-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK7-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK7-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK7-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK7-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK7-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK7-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: land.lhs.true: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK7-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK7-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK7-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK7-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK7-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK7-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK7-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK7-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK7-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK7-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK7-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK7-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK7-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK7: cond.true18: -// CHECK7-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: br label [[COND_END20:%.*]] -// CHECK7: cond.false19: -// CHECK7-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: br label [[COND_END20]] -// CHECK7: cond.end20: -// CHECK7-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK7-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK7-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK7-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: land.lhs.true: -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK7-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK7-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK7-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK7-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK7-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK7-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK7-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK7-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK7-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK7-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK7-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK7-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK7-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK7-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK7-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK7-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK7-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK7-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK7-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK7-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK7-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK7-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK7-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK7-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK7-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK7-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK7-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK7: .execute: -// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK7-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK7: .omp.deinit: -// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK7-NEXT: br label [[DOTEXIT:%.*]] -// CHECK7: .exit: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK7: cond.true: -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: br label [[COND_END:%.*]] -// CHECK7: cond.false: -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END]] -// CHECK7: cond.end: -// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK7-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK7-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK7-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK7-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK7-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK7-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK7-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK7-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK7-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK7: cond.true10: -// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: br label [[COND_END12:%.*]] -// CHECK7: cond.false11: -// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: br label [[COND_END12]] -// CHECK7: cond.end12: -// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK7: omp.precond.then: -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK7: omp.inner.for.cond: -// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK7: omp.inner.for.body: -// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK7-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK7-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK7: omp.body.continue: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK7: omp.inner.for.inc: -// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK7: omp.inner.for.end: -// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK7: omp.loop.exit: -// CHECK7-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK7-NEXT: br label [[OMP_PRECOND_END]] -// CHECK7: omp.precond.end: -// CHECK7-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK8-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK8-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK8-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK8-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK8-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK8-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK8-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK8-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK8-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK8-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK8-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK8-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* -// CHECK8-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK8-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -// CHECK8-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK8-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] -// CHECK8-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK8: cond.true14: -// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK8-NEXT: br label [[COND_END16:%.*]] -// CHECK8: cond.false15: -// CHECK8-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END16]] -// CHECK8: cond.end16: -// CHECK8-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] -// CHECK8-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) -// CHECK8-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK8-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK8: .omp.lastprivate.then: -// CHECK8-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK8-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 -// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK8: .omp.lastprivate.done: -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK8-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK8-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK8: omp.dispatch.cond: -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK8-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK8-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK8-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK8-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK8: omp.dispatch.body: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK8-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK8-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK8-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK8: omp.dispatch.inc: -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK8-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK8: omp.dispatch.end: -// CHECK8-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK8-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK8: .omp.lastprivate.then: -// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK8-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK8: .omp.lastprivate.done: -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK8-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK8-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK8-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK8-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK8-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK8: cond.true11: -// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: br label [[COND_END13:%.*]] -// CHECK8: cond.false12: -// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END13]] -// CHECK8: cond.end13: -// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK8-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK8-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK8-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK8-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK8-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK8-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK8-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK8-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK8-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK8-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK8: cond.true5: -// CHECK8-NEXT: br label [[COND_END7:%.*]] -// CHECK8: cond.false6: -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END7]] -// CHECK8: cond.end7: -// CHECK8-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK8-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK8-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK8-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK8-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK8-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK8: cond.true7: -// CHECK8-NEXT: br label [[COND_END9:%.*]] -// CHECK8: cond.false8: -// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END9]] -// CHECK8: cond.end9: -// CHECK8-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK8-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK8-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK8-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK8-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK8-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK8-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK8-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK8-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK8-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK8-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK8-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: land.lhs.true: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK8-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]]) -// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK8-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK8-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK8-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK8-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK8-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK8-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]] -// CHECK8-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK8-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK8-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]] -// CHECK8-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]] -// CHECK8: cond.true20: -// CHECK8-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: br label [[COND_END22:%.*]] -// CHECK8: cond.false21: -// CHECK8-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: br label [[COND_END22]] -// CHECK8: cond.end22: -// CHECK8-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] -// CHECK8-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK8-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK8-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK8-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK8-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK8-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK8-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: land.lhs.true: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK8-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK8-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 -// CHECK8-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 -// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK8-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] -// CHECK8-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK8-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 -// CHECK8-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] -// CHECK8-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 -// CHECK8-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] -// CHECK8-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] -// CHECK8-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK8-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 -// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK8-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 -// CHECK8-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] -// CHECK8-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 -// CHECK8-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK8-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK8-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 -// CHECK8-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] -// CHECK8-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 -// CHECK8-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] -// CHECK8-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] -// CHECK8-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 -// CHECK8-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] -// CHECK8-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 -// CHECK8-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 -// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK8-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK8-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK8-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] -// CHECK8-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK8-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK8-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK8-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK8-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK8-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK8-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK8-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK8: cond.true11: -// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: br label [[COND_END13:%.*]] -// CHECK8: cond.false12: -// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END13]] -// CHECK8: cond.end13: -// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK8: omp.precond.then: -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK8-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK8-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK8-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK8-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK8-NEXT: br label [[OMP_PRECOND_END]] -// CHECK8: omp.precond.end: -// CHECK8-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -14367,8 +1686,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14523,8 +1840,6 @@ // CHECK9: omp.precond.end: // CHECK9-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14658,8 +1973,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14690,8 +2003,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14824,8 +2135,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14919,8 +2228,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14943,8 +2250,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15038,8 +2343,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15108,8 +2411,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15140,8 +2441,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15249,8 +2548,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15343,8 +2640,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK9-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15375,8 +2670,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15525,8 +2818,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15662,8 +2953,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15697,8 +2986,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15837,8 +3124,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -15935,8 +3220,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -15975,8 +3258,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16136,8 +3417,6 @@ // CHECK10-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16271,8 +3550,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16303,8 +3580,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16437,8 +3712,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16532,8 +3805,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16556,8 +3827,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16651,8 +3920,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16721,8 +3988,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16753,8 +4018,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16862,8 +4125,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16956,8 +4217,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK10-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -16988,8 +4247,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -17137,8 +4394,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -17270,8 +4525,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -17305,8 +4558,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -17445,8 +4696,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -17543,8 +4792,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -17583,8 +4830,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17739,8 +4984,6 @@ // CHECK11: omp.precond.end: // CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17874,8 +5117,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17906,8 +5147,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18040,8 +5279,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18135,8 +5372,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18159,8 +5394,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18254,8 +5487,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18324,8 +5555,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18356,8 +5585,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18465,8 +5692,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18559,8 +5784,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK11-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18591,8 +5814,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18740,8 +5961,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18873,8 +6092,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -18908,8 +6125,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -19048,8 +6263,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -19146,8 +6359,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK12-NEXT: entry: @@ -19182,8 +6393,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19337,8 +6546,6 @@ // CHECK12-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK12-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19464,8 +6671,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19494,8 +6699,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19624,8 +6827,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19714,8 +6915,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19738,8 +6937,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19831,8 +7028,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19897,8 +7092,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19927,8 +7120,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20032,8 +7223,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20120,8 +7309,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK12-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20150,8 +7337,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20300,8 +7485,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20437,8 +7620,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20470,8 +7651,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20606,8 +7785,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -20698,338 +7875,1281 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK13-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK13-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK13-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK13-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK13-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK13-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK13-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK13-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK13-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK13-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK13-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK13-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK13-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK13-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK13: cond.true11: +// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: br label [[COND_END13:%.*]] +// CHECK13: cond.false12: +// CHECK13-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END13]] +// CHECK13: cond.end13: +// CHECK13-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK13-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK13-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK13-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK13: .omp.lastprivate.then: +// CHECK13-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK13: .omp.lastprivate.done: +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK13-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK13-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK13: omp.dispatch.cond: +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK13: omp.dispatch.body: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK13-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK13-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK13: omp.dispatch.inc: +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK13: omp.dispatch.end: +// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK13-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK13: .omp.lastprivate.then: +// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK13: .omp.lastprivate.done: +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK13-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK13-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK13-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK13-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK13-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK13-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK13-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK13: cond.true10: +// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: br label [[COND_END12:%.*]] +// CHECK13: cond.false11: +// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END12]] +// CHECK13: cond.end12: +// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK13-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: omp.precond.then: +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK13-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK13-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK13-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK13-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK13-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK13-NEXT: br label [[OMP_PRECOND_END]] +// CHECK13: omp.precond.end: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK13-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK13-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK13: cond.true5: +// CHECK13-NEXT: br label [[COND_END7:%.*]] +// CHECK13: cond.false6: +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END7]] +// CHECK13: cond.end7: +// CHECK13-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK13-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK13-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK13-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK13: .execute: +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK13-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK13: .omp.deinit: +// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK13-NEXT: br label [[DOTEXIT:%.*]] +// CHECK13: .exit: +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13: cond.true: +// CHECK13-NEXT: br label [[COND_END:%.*]] +// CHECK13: cond.false: +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END]] +// CHECK13: cond.end: +// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK13-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK13-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK13-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK13-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK13: cond.true6: +// CHECK13-NEXT: br label [[COND_END8:%.*]] +// CHECK13: cond.false7: +// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: br label [[COND_END8]] +// CHECK13: cond.end8: +// CHECK13-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK13-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK13: omp.inner.for.cond: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13: omp.inner.for.body: +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK13-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK13-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK13-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK13-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK13-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK13-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK13-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK13: omp.body.continue: +// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK13: omp.inner.for.inc: +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK13-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK13: omp.inner.for.end: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK13-NEXT: ret void +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK13-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK13-NEXT: entry: +// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 // CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) // CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK13: .execute: -// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) // CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 // CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 // CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK13-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] // CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK13: .omp.deinit: // CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J10:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 // CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK13-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK13-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK13-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK13-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 // CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK13-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 // CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: land.lhs.true: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] // CHECK13: omp.precond.then: -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 // CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK13-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK13-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK13-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK13: cond.true: -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 // CHECK13-NEXT: br label [[COND_END:%.*]] // CHECK13: cond.false: -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 // CHECK13-NEXT: br label [[COND_END]] // CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK13-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK13-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK13-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK13-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK13-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 // CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 // CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* // CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* // CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* // CHECK13-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK13-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK13-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK13-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK13-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK13-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK13-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK13-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK13-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK13-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK13-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK13: cond.true11: -// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: br label [[COND_END13:%.*]] -// CHECK13: cond.false12: -// CHECK13-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END13]] -// CHECK13: cond.end13: -// CHECK13-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK13-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK13-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK13-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK13-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK13-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK13: cond.true18: +// CHECK13-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: br label [[COND_END20:%.*]] +// CHECK13: cond.false19: +// CHECK13-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: br label [[COND_END20]] +// CHECK13: cond.end20: +// CHECK13-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK13-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK13-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK13: omp.inner.for.end: // CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK13-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK13-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK13: .omp.lastprivate.then: -// CHECK13-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK13: .omp.lastprivate.done: +// CHECK13-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: -// CHECK13-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK13-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) // CHECK13-NEXT: ret void -// -// -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 // CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK13-NEXT: [[J12:%.*]] = alloca i32, align 4 // CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 // CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 // CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 // CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK13: land.lhs.true: +// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] // CHECK13: omp.precond.then: -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK13-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 // CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK13-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK13: omp.dispatch.cond: -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK13: cond.true: -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: br label [[COND_END:%.*]] -// CHECK13: cond.false: -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END]] -// CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK13: omp.dispatch.body: +// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK13-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK13-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK13-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK13-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK13-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK13-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK13-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK13-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK13-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK13-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK13-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK13-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK13-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK13-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK13-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK13-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK13-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK13-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK13-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK13-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK13-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK13-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK13-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK13-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK13-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK13-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK13-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK13-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK13-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK13-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK13-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK13-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK13-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK13-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK13: omp.dispatch.inc: -// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK13-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK13: omp.dispatch.end: -// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK13-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK13: .omp.lastprivate.then: -// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK13-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK13: .omp.lastprivate.done: +// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK13: omp.loop.exit: +// CHECK13-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void -// -// -// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) // CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] @@ -21038,23 +9158,23 @@ // CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 // CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK13-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK13-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] // CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK13: .omp.deinit: // CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -21066,12 +9186,13 @@ // CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 // CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -21121,72 +9242,75 @@ // CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 // CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK13-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK13-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK13-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK13-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK13-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK13-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK13-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK13-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK13-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK13-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK13-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK13-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK13-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK13-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK13-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK13-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK13-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK13-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK13-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] // CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] // CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] // CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] // CHECK13-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] // CHECK13: cond.true10: -// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 // CHECK13-NEXT: br label [[COND_END12:%.*]] // CHECK13: cond.false11: -// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK13-NEXT: br label [[COND_END12]] // CHECK13: cond.end12: -// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] // CHECK13-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK13-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK13: omp.inner.for.end: // CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK13-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void -// -// -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -21202,8 +9326,9 @@ // CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 // CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 // CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -21241,4107 +9366,15507 @@ // CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK13-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK13-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK13-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK13-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK13-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK13-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 // CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK13: omp.inner.for.end: // CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK13-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK14-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK14-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK14-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK14-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK14-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK14-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK14-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK14-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK14: cond.true11: +// CHECK14-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END13:%.*]] +// CHECK14: cond.false12: +// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END13]] +// CHECK14: cond.end13: +// CHECK14-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK14-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK14-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK14-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK14: omp.dispatch.cond: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK14: omp.dispatch.body: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK14-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK14: omp.dispatch.inc: +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK14: omp.dispatch.end: +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK14-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK14-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK14-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK14: cond.true10: +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END12:%.*]] +// CHECK14: cond.false11: +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END12]] +// CHECK14: cond.end12: +// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK14-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK14-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK14-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK14-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK14-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK14-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK14-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK14: cond.true5: +// CHECK14-NEXT: br label [[COND_END7:%.*]] +// CHECK14: cond.false6: +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END7]] +// CHECK14: cond.end7: +// CHECK14-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK14-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK14-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK14-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK14-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK14-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK14-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK14: cond.true6: +// CHECK14-NEXT: br label [[COND_END8:%.*]] +// CHECK14: cond.false7: +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END8]] +// CHECK14: cond.end8: +// CHECK14-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK14-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK14-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK14-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK14-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK14-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK14-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK14-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK14-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK14-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: land.lhs.true: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK14-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK14-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK14-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK14-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK14-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK14-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK14-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK14-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK14: cond.true18: +// CHECK14-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: br label [[COND_END20:%.*]] +// CHECK14: cond.false19: +// CHECK14-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: br label [[COND_END20]] +// CHECK14: cond.end20: +// CHECK14-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK14-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK14-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK14-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: land.lhs.true: +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK14-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK14-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK14-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK14-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK14-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK14-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK14-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK14-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK14-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK14-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK14-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK14-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK14-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK14-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK14-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK14-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK14-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK14-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK14-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK14-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK14-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK14-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK14-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK14-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK14-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK14-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK14-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK14-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK14-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK14-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK14-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK14-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK14-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK14: cond.true10: +// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END12:%.*]] +// CHECK14: cond.false11: +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END12]] +// CHECK14: cond.end12: +// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK14-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK15-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK15-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK15-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK15-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK15-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK15-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK15-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK15-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK15-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK15: cond.true11: +// CHECK15-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: br label [[COND_END13:%.*]] +// CHECK15: cond.false12: +// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END13]] +// CHECK15: cond.end13: +// CHECK15-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK15-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK15-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK15-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK15: .omp.lastprivate.then: +// CHECK15-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK15: .omp.lastprivate.done: +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK15-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK15: omp.dispatch.cond: +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK15: omp.dispatch.body: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK15-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK15: omp.dispatch.inc: +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK15: omp.dispatch.end: +// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK15-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK15: .omp.lastprivate.then: +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK15: .omp.lastprivate.done: +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK15-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK15-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK15: cond.true10: +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END12:%.*]] +// CHECK15: cond.false11: +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END12]] +// CHECK15: cond.end12: +// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK15-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK15-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK15-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK15-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK15-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK15-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK15: cond.true5: +// CHECK15-NEXT: br label [[COND_END7:%.*]] +// CHECK15: cond.false6: +// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END7]] +// CHECK15: cond.end7: +// CHECK15-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK15-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK15-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK15-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK15-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK15-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK15-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK15: cond.true6: +// CHECK15-NEXT: br label [[COND_END8:%.*]] +// CHECK15: cond.false7: +// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END8]] +// CHECK15: cond.end8: +// CHECK15-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK15-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK15-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK15-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK15-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK15-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK15-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 +// CHECK15-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: land.lhs.true: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK15-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK15-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK15-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK15-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK15-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK15-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK15-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK15-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK15-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK15-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK15: cond.true18: +// CHECK15-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: br label [[COND_END20:%.*]] +// CHECK15: cond.false19: +// CHECK15-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: br label [[COND_END20]] +// CHECK15: cond.end20: +// CHECK15-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK15-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK15-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK15-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: land.lhs.true: +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK15-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK15-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK15-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK15-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK15-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK15-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK15-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK15-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK15-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK15-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK15-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK15-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK15-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK15-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK15-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK15-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK15-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK15-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK15-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK15-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK15-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK15-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK15-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK15-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK15-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK15-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK15-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK15-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK15-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK15-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK15: cond.true10: +// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END12:%.*]] +// CHECK15: cond.false11: +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END12]] +// CHECK15: cond.end12: +// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK15-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK5-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK5-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK5-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK5-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK5-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK5-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK5-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK5-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK5-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK5-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK5-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK5: cond.true14: +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK5-NEXT: br label [[COND_END16:%.*]] +// CHECK5: cond.false15: +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END16]] +// CHECK5: cond.end16: +// CHECK5-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK5-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK5-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK5-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK5-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK5-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK5: omp.dispatch.cond: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK5-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK5-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK5-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK5: omp.dispatch.body: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK5-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK5: omp.dispatch.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK5: omp.dispatch.end: +// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l41 +// CHECK5-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK5-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK5: cond.true11: +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END13:%.*]] +// CHECK5: cond.false12: +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END13]] +// CHECK5: cond.end13: +// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK5-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK5-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK5-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK5-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK5-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK5: cond.true5: +// CHECK5-NEXT: br label [[COND_END7:%.*]] +// CHECK5: cond.false6: +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END7]] +// CHECK5: cond.end7: +// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK5-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK5-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l51 +// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK5-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK5-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK5-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK5: cond.true7: +// CHECK5-NEXT: br label [[COND_END9:%.*]] +// CHECK5: cond.false8: +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END9]] +// CHECK5: cond.end9: +// CHECK5-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK5-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK5-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK5-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK5-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK5-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK5-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK5-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK5-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: land.lhs.true: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK5-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK5-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK5-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]] +// CHECK5-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK5-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]] +// CHECK5-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]] +// CHECK5: cond.true20: +// CHECK5-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: br label [[COND_END22:%.*]] +// CHECK5: cond.false21: +// CHECK5-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: br label [[COND_END22]] +// CHECK5: cond.end22: +// CHECK5-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] +// CHECK5-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK5-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK5-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK5-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK5-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK5-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK5-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: land.lhs.true: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK5-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK5-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK5-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK5-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 +// CHECK5-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 +// CHECK5-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK5-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK5-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] +// CHECK5-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK5-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 +// CHECK5-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] +// CHECK5-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 +// CHECK5-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] +// CHECK5-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] +// CHECK5-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK5-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK5-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 +// CHECK5-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] +// CHECK5-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 +// CHECK5-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK5-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 +// CHECK5-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] +// CHECK5-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 +// CHECK5-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] +// CHECK5-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] +// CHECK5-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 +// CHECK5-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] +// CHECK5-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 +// CHECK5-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK5-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK5-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK5-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] +// CHECK5-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK5-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK5-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l66 +// CHECK5-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK5-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK5-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK5-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK5: cond.true11: +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END13:%.*]] +// CHECK5: cond.false12: +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END13]] +// CHECK5: cond.end13: +// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK5-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK5-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK5-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK5-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK6-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK6-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK6-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK6-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK6-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK6-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK6-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK6-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK6-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK6-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK6: cond.true14: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: br label [[COND_END16:%.*]] +// CHECK6: cond.false15: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END16]] +// CHECK6: cond.end16: +// CHECK6-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK6-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK6-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK6-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK6-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK6-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK6: omp.dispatch.cond: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK6-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK6-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK6-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK6: omp.dispatch.body: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK6-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK6: omp.dispatch.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK6: omp.dispatch.end: +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK6-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l41 +// CHECK6-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK6-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK6-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK6-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK6-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK6-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK6-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK6-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK6-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK6: cond.true5: +// CHECK6-NEXT: br label [[COND_END7:%.*]] +// CHECK6: cond.false6: +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END7]] +// CHECK6: cond.end7: +// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK6-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK6-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l51 +// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK6-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK6-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK6-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK6: cond.true7: +// CHECK6-NEXT: br label [[COND_END9:%.*]] +// CHECK6: cond.false8: +// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END9]] +// CHECK6: cond.end9: +// CHECK6-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK6-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK6-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK6-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK6-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK6-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK6-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK6-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK6-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I8:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J9:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK6-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK6-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: land.lhs.true: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK6-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK6-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK6-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK6-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] +// CHECK6: cond.true17: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: br label [[COND_END19:%.*]] +// CHECK6: cond.false18: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END19]] +// CHECK6: cond.end19: +// CHECK6-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] +// CHECK6-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK6-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK6-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK6-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK6-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: land.lhs.true: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 +// CHECK6-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK6-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK6-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] +// CHECK6-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK6-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK6-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] +// CHECK6-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] +// CHECK6-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK6-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 +// CHECK6-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] +// CHECK6-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK6-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 +// CHECK6-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] +// CHECK6-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] +// CHECK6-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] +// CHECK6-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 +// CHECK6-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] +// CHECK6-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK6-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK6-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK6-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] +// CHECK6-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] +// CHECK6-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l66 +// CHECK6-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK6-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK6-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK6-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK6-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK6-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK7-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK7-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK7-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK7-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK7-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK7-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK7-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK7-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK7: cond.true11: +// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: br label [[COND_END13:%.*]] +// CHECK7: cond.false12: +// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END13]] +// CHECK7: cond.end13: +// CHECK7-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK7-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK7-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK7: .omp.lastprivate.then: +// CHECK7-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK7: .omp.lastprivate.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK7-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK7: omp.dispatch.cond: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK7: omp.dispatch.body: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK7-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK7-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK7: omp.dispatch.inc: +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK7-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK7: omp.dispatch.end: +// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK7-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK7: .omp.lastprivate.then: +// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK7-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK7: .omp.lastprivate.done: +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l41 +// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK7-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK7-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK7-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK7-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK7-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK7: cond.true10: +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END12:%.*]] +// CHECK7: cond.false11: +// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END12]] +// CHECK7: cond.end12: +// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK7-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK7-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK7-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK7-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK7-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK7-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK7: cond.true5: +// CHECK7-NEXT: br label [[COND_END7:%.*]] +// CHECK7: cond.false6: +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END7]] +// CHECK7: cond.end7: +// CHECK7-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK7-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK7-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK7-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l51 +// CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK7-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK7-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK7-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK7-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK7-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK7-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK7-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK7-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK7: cond.true6: +// CHECK7-NEXT: br label [[COND_END8:%.*]] +// CHECK7: cond.false7: +// CHECK7-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END8]] +// CHECK7: cond.end8: +// CHECK7-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK7-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK7-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK7-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK7-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK7-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK7-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK7-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK7-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK7-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: land.lhs.true: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK7-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK7-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK7-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK7-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK7-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK7-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK7-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK7-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK7-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK7-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK7: cond.true18: +// CHECK7-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: br label [[COND_END20:%.*]] +// CHECK7: cond.false19: +// CHECK7-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: br label [[COND_END20]] +// CHECK7: cond.end20: +// CHECK7-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK7-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK7-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK7-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK7-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK7-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK7-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK7-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK7-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: land.lhs.true: +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK7-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK7-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK7-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK7-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK7-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK7-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK7-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK7-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK7-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK7-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK7-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK7-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK7-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK7-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK7-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK7-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK7-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK7-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK7-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK7-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK7-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK7-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK7-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK7-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK7-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK7-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK7-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK7-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK7-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK7-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK7-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK7-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK7-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l66 +// CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK7-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK7: .execute: +// CHECK7-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK7-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK7: .omp.deinit: +// CHECK7-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK7-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK7-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK7: cond.true: +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END:%.*]] +// CHECK7: cond.false: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END]] +// CHECK7: cond.end: +// CHECK7-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK7-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK7-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK7-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK7-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK7-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK7-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK7-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK7-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK7-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK7-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK7-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK7-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK7-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK7-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK7-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK7-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK7-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK7-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK7-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK7-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK7: cond.true10: +// CHECK7-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: br label [[COND_END12:%.*]] +// CHECK7: cond.false11: +// CHECK7-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: br label [[COND_END12]] +// CHECK7: cond.end12: +// CHECK7-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK7-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK7-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK7-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK7-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK7-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK7-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK7-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK7-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK7-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK7-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK7: omp.precond.then: +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK7-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK7-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK7-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK7-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK7-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK7: omp.inner.for.cond: +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK7-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK7-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK7: omp.inner.for.body: +// CHECK7-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK7-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK7-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK7-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK7-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK7-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK7-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK7: omp.body.continue: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK7: omp.inner.for.inc: +// CHECK7-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK7-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK7: omp.inner.for.end: +// CHECK7-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK7: omp.loop.exit: +// CHECK7-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK7-NEXT: br label [[OMP_PRECOND_END]] +// CHECK7: omp.precond.end: +// CHECK7-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 +// CHECK8-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK8-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK8-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK8-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK8-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK8-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK8-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK8-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK8-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK8-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK8-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK8-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK8: cond.true11: +// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: br label [[COND_END13:%.*]] +// CHECK8: cond.false12: +// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END13]] +// CHECK8: cond.end13: +// CHECK8-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK8-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK8-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK8-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK8: omp.dispatch.cond: +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK8: omp.dispatch.body: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK8-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK8-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK8-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK8: omp.dispatch.inc: +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK8-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK8: omp.dispatch.end: +// CHECK8-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK8-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK8: .omp.lastprivate.then: +// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK8-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK8: .omp.lastprivate.done: +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l41 +// CHECK8-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK8-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK8-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK8-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK8-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK8-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK8-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK8-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK8-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK8: cond.true10: +// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: br label [[COND_END12:%.*]] +// CHECK8: cond.false11: +// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END12]] +// CHECK8: cond.end12: +// CHECK8-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK8-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK8-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK8-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK8-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK8-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK8-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK8-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK8-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK8-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK8-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK8-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK8: cond.true5: +// CHECK8-NEXT: br label [[COND_END7:%.*]] +// CHECK8: cond.false6: +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END7]] +// CHECK8: cond.end7: +// CHECK8-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK8-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK8-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK8-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l51 +// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK8-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK8-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK8-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK8-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK8: cond.true6: +// CHECK8-NEXT: br label [[COND_END8:%.*]] +// CHECK8: cond.false7: +// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END8]] +// CHECK8: cond.end8: +// CHECK8-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK8-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK8-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK8-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK8-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK8-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK8-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK8-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK8-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK8-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK8-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: land.lhs.true: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK8-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK8-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK8-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK8-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK8-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK8-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK8-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK8-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK8-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK8-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK8-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK8-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK8-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK8: cond.true18: +// CHECK8-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: br label [[COND_END20:%.*]] +// CHECK8: cond.false19: +// CHECK8-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: br label [[COND_END20]] +// CHECK8: cond.end20: +// CHECK8-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK8-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK8-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK8-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK8-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK8-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK8-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK8-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: land.lhs.true: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK8-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK8-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK8-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK8-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK8-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK8-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK8-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK8-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK8-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK8-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK8-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK8-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK8-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK8-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK8-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK8-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK8-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK8-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK8-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK8-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK8-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK8-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK8-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK8-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK8-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK8-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK8-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK8-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK8-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK8-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK8-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK8-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK8-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK8-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK8-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l66 +// CHECK8-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK8-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK8-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK8-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK8-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK8-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK8-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK8-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK8-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK8-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK8-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK8-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK8-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK8-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK8-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK8-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK8-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK8-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK8: cond.true10: +// CHECK8-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: br label [[COND_END12:%.*]] +// CHECK8: cond.false11: +// CHECK8-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END12]] +// CHECK8: cond.end12: +// CHECK8-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK8-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK8-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK8-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK8-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK8: omp.precond.then: +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK8-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK8-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK8-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK8-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK8-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK8-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK8-NEXT: br label [[OMP_PRECOND_END]] +// CHECK8: omp.precond.end: +// CHECK8-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK13-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK13: .execute: -// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK13: .omp.deinit: -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK13-NEXT: br label [[DOTEXIT:%.*]] -// CHECK13: .exit: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK1-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK13: cond.true: -// CHECK13-NEXT: br label [[COND_END:%.*]] -// CHECK13: cond.false: -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END]] -// CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK13-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK13-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK13: cond.true5: -// CHECK13-NEXT: br label [[COND_END7:%.*]] -// CHECK13: cond.false6: -// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END7]] -// CHECK13: cond.end7: -// CHECK13-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK13-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK13-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK13-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK13: omp.body.continue: -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK13-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK13-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK13: .execute: -// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK13: .omp.deinit: -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK13-NEXT: br label [[DOTEXIT:%.*]] -// CHECK13: .exit: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK13: cond.true: -// CHECK13-NEXT: br label [[COND_END:%.*]] -// CHECK13: cond.false: -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END]] -// CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK13-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK13-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK13-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK13-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK13-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK13-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK13-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK13-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK13-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK13: cond.true6: -// CHECK13-NEXT: br label [[COND_END8:%.*]] -// CHECK13: cond.false7: -// CHECK13-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END8]] -// CHECK13: cond.end8: -// CHECK13-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK13-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK13-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK13-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK13-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK13-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK13-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK13-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK13-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK13: omp.body.continue: -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK13-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK1: cond.true5: +// CHECK1-NEXT: br label [[COND_END7:%.*]] +// CHECK1: cond.false6: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END7]] +// CHECK1: cond.end7: +// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK13-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK13: .execute: -// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK13: .omp.deinit: -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK13-NEXT: br label [[DOTEXIT:%.*]] -// CHECK13: .exit: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK13: land.lhs.true: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.then: -// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK13-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK13-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK13: cond.true: -// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: br label [[COND_END:%.*]] -// CHECK13: cond.false: -// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: br label [[COND_END]] -// CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK13-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK13-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK13-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK13-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK13-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK13-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK13-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK13-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK13-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK13-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK13-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK13: cond.true18: -// CHECK13-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: br label [[COND_END20:%.*]] -// CHECK13: cond.false19: -// CHECK13-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: br label [[COND_END20]] -// CHECK13: cond.end20: -// CHECK13-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK13-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK13-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK13-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK13-NEXT: br label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.end: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK1: cond.true7: +// CHECK1-NEXT: br label [[COND_END9:%.*]] +// CHECK1: cond.false8: +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END9]] +// CHECK1: cond.end9: +// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK13-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK13-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK13-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK13-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK13-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK13: land.lhs.true: -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK13-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.then: -// CHECK13-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK13-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK13-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK13-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK13-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK13-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK13-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK13-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK13-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK13-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK13-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK13-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK13-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK13-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK13-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK13-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK13-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK13-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK13-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK13-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK13-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK13-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK13-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK13-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK13-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK13-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK13-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK13-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK13-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK13-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK13-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK13-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK13-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK13-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK13-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK13-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK13-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK13-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK13: omp.body.continue: -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK13-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK13-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK13-NEXT: br label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.end: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK13-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK13: .execute: -// CHECK13-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK13-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK13-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK13: .omp.deinit: -// CHECK13-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK13-NEXT: br label [[DOTEXIT:%.*]] -// CHECK13: .exit: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52 +// CHECK1-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK13: omp.precond.then: -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK13-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK13-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK13-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK13: cond.true: -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: br label [[COND_END:%.*]] -// CHECK13: cond.false: -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END]] -// CHECK13: cond.end: -// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK13-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK13-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK13-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK13-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK13-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK13-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK13-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK13-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK13-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK13-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK13-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK13-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK13-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK13-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK13-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK13-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK13-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK13-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK13-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK13-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK13-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK13-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK13: cond.true10: -// CHECK13-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: br label [[COND_END12:%.*]] -// CHECK13: cond.false11: -// CHECK13-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: br label [[COND_END12]] -// CHECK13: cond.end12: -// CHECK13-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK13-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK13-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK13-NEXT: br label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.end: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: land.lhs.true: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK1-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]] +// CHECK1-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK1-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]] +// CHECK1-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]] +// CHECK1: cond.true20: +// CHECK1-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: br label [[COND_END22:%.*]] +// CHECK1: cond.false21: +// CHECK1-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END22]] +// CHECK1: cond.end22: +// CHECK1-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] +// CHECK1-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: land.lhs.true: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 +// CHECK1-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] +// CHECK1-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 +// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] +// CHECK1-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK1-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 +// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] +// CHECK1-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 +// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK1-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 +// CHECK1-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] +// CHECK1-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 +// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] +// CHECK1-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] +// CHECK1-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 +// CHECK1-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] +// CHECK1-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 +// CHECK1-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK1-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] +// CHECK1-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK13-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK13-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK13-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK13-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK13-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK13-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK13-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK13-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK13-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK13: omp.precond.then: -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK13-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK13-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK13-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK13-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK13-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK13-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK13: omp.inner.for.cond: -// CHECK13-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK13-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK13-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK13: omp.inner.for.body: -// CHECK13-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK13-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK13-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK13-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK13-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK13-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK13-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK13: omp.body.continue: -// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK13: omp.inner.for.inc: -// CHECK13-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK13-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK13: omp.inner.for.end: -// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK13: omp.loop.exit: -// CHECK13-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK13-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK13-NEXT: br label [[OMP_PRECOND_END]] -// CHECK13: omp.precond.end: -// CHECK13-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK14-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK14-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK14-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK14-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK14-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK14-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK14-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK14-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK14-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK14-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK14-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK14: cond.true11: -// CHECK14-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: br label [[COND_END13:%.*]] -// CHECK14: cond.false12: -// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END13]] -// CHECK14: cond.end13: -// CHECK14-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK14-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK14-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK14-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK14: .omp.lastprivate.then: -// CHECK14-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK14: .omp.lastprivate.done: -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK14-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK14: omp.dispatch.cond: -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK14: omp.dispatch.body: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK14-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK14: omp.dispatch.inc: -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK14: omp.dispatch.end: -// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK14-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK14: .omp.lastprivate.then: -// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK14: .omp.lastprivate.done: -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK2: cond.true14: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END16:%.*]] +// CHECK2: cond.false15: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END16]] +// CHECK2: cond.end16: +// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK2-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK14-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK14-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK14: cond.true10: -// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: br label [[COND_END12:%.*]] -// CHECK14: cond.false11: -// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END12]] -// CHECK14: cond.end12: -// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK14-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK14-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK14-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK14-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK14-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK14-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK14-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK14: cond.true5: -// CHECK14-NEXT: br label [[COND_END7:%.*]] -// CHECK14: cond.false6: -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END7]] -// CHECK14: cond.end7: -// CHECK14-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK14-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK14-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK14-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK2: cond.true5: +// CHECK2-NEXT: br label [[COND_END7:%.*]] +// CHECK2: cond.false6: +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END7]] +// CHECK2: cond.end7: +// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK14-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK2: cond.true7: +// CHECK2-NEXT: br label [[COND_END9:%.*]] +// CHECK2: cond.false8: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END9]] +// CHECK2: cond.end9: +// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK14-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK14-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK14-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK14: cond.true6: -// CHECK14-NEXT: br label [[COND_END8:%.*]] -// CHECK14: cond.false7: -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END8]] -// CHECK14: cond.end8: -// CHECK14-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK14-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52 +// CHECK2-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK14-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK14-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK14-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK14-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK14-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK14-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I8:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J9:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] +// CHECK2: cond.true17: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END19:%.*]] +// CHECK2: cond.false18: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END19]] +// CHECK2: cond.end19: +// CHECK2-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] +// CHECK2-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK14-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 +// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK2-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] +// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] +// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK2-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 +// CHECK2-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] +// CHECK2-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 +// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] +// CHECK2-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] +// CHECK2-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] +// CHECK2-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 +// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] +// CHECK2-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK2-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK2-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK2-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] +// CHECK2-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: land.lhs.true: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK14-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK14-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK14-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK14-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK14-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK14-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK14-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK14-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK14-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK14-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK14-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK14: cond.true18: -// CHECK14-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: br label [[COND_END20:%.*]] -// CHECK14: cond.false19: -// CHECK14-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: br label [[COND_END20]] -// CHECK14: cond.end20: -// CHECK14-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK14-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK14-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK14-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: land.lhs.true: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK14-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK14-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK14-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK14-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK14-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK14-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK14-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK14-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK14-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK14-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK14-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK14-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK14-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK14-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK14-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK14-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK14-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK14-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK14-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK14-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK14-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK14-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK14-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK14-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK14-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK14-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK14-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK14-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK14-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK14-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK14-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK3-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK14-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK14-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK14-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK14-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK14-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK14: cond.true10: -// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: br label [[COND_END12:%.*]] -// CHECK14: cond.false11: -// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END12]] -// CHECK14: cond.end12: -// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK14-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK3: cond.true5: +// CHECK3-NEXT: br label [[COND_END7:%.*]] +// CHECK3: cond.false6: +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END7]] +// CHECK3: cond.end7: +// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK15-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK15-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK15-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK15-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK15-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK15-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK15-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK15: cond.true11: -// CHECK15-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: br label [[COND_END13:%.*]] -// CHECK15: cond.false12: -// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END13]] -// CHECK15: cond.end13: -// CHECK15-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK15-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK15-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK15-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK15: .omp.lastprivate.then: -// CHECK15-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK15: .omp.lastprivate.done: -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK3: cond.true6: +// CHECK3-NEXT: br label [[COND_END8:%.*]] +// CHECK3: cond.false7: +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END8]] +// CHECK3: cond.end8: +// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK15-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK15: omp.dispatch.cond: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK15: omp.dispatch.body: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK15-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK15: omp.dispatch.inc: -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK15: omp.dispatch.end: -// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK15-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK15: .omp.lastprivate.then: -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK15: .omp.lastprivate.done: -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52 +// CHECK3-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK3-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: land.lhs.true: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK3-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK3-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK3-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK3-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK3: cond.true18: +// CHECK3-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: br label [[COND_END20:%.*]] +// CHECK3: cond.false19: +// CHECK3-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: br label [[COND_END20]] +// CHECK3: cond.end20: +// CHECK3-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK3-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK3-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK3-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK3-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: land.lhs.true: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK3-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK3-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK3-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK3-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK3-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK3-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK3-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK3-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK3-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK3-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK3-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK3-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK3-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK3-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK3-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK3-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK3-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK3-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK3-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK3-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK3-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK3-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK3-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK3-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK3-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK3-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK3-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK3-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK3-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK3-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK3-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK15-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK15-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK15: cond.true10: -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END12:%.*]] -// CHECK15: cond.false11: -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END12]] -// CHECK15: cond.end12: -// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK15-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK15-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK15-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK15-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14:![0-9]+]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK4: cond.true11: +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END13:%.*]] +// CHECK4: cond.false12: +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END13]] +// CHECK4: cond.end13: +// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK4-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK15-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK15: cond.true5: -// CHECK15-NEXT: br label [[COND_END7:%.*]] -// CHECK15: cond.false6: -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END7]] -// CHECK15: cond.end7: -// CHECK15-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK15-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK4: omp.dispatch.cond: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK4: omp.dispatch.body: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK4: omp.dispatch.inc: +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK4: omp.dispatch.end: +// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK15-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK4: cond.true10: +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END12:%.*]] +// CHECK4: cond.false11: +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END12]] +// CHECK4: cond.end12: +// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK15-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK15-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK15-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK15-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK15: cond.true6: -// CHECK15-NEXT: br label [[COND_END8:%.*]] -// CHECK15: cond.false7: -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END8]] -// CHECK15: cond.end8: -// CHECK15-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK15-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK4: cond.true5: +// CHECK4-NEXT: br label [[COND_END7:%.*]] +// CHECK4: cond.false6: +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END7]] +// CHECK4: cond.end7: +// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK15-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK15-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK15-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK15-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK15-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK4: cond.true6: +// CHECK4-NEXT: br label [[COND_END8:%.*]] +// CHECK4: cond.false7: +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END8]] +// CHECK4: cond.end8: +// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: land.lhs.true: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK15-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK15-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK15-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK15-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK15-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK15-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK15-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK15: cond.true18: -// CHECK15-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: br label [[COND_END20:%.*]] -// CHECK15: cond.false19: -// CHECK15-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: br label [[COND_END20]] -// CHECK15: cond.end20: -// CHECK15-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK15-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK4-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I11:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: land.lhs.true: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK15-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK15-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK15-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK15-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK15-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK15-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK15-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK15-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK15-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK15-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK15-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK15-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK15-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK15-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK15-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK15-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK15-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK15-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK15-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK15-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK15-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK15-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK15-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK15-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52 +// CHECK4-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J10:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: land.lhs.true: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) +// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 +// CHECK4-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 +// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* +// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] +// CHECK4-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] +// CHECK4-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] +// CHECK4-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] +// CHECK4-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] +// CHECK4: cond.true18: +// CHECK4-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: br label [[COND_END20:%.*]] +// CHECK4: cond.false19: +// CHECK4-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: br label [[COND_END20]] +// CHECK4: cond.end20: +// CHECK4-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] +// CHECK4-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK4-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK4-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I11:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J12:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] +// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: land.lhs.true: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK4-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 +// CHECK4-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 +// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK4-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK4-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] +// CHECK4-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK4-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] +// CHECK4-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 +// CHECK4-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] +// CHECK4-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] +// CHECK4-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK4-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK4-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 +// CHECK4-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] +// CHECK4-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 +// CHECK4-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK4-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 +// CHECK4-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] +// CHECK4-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 +// CHECK4-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] +// CHECK4-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] +// CHECK4-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 +// CHECK4-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] +// CHECK4-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 +// CHECK4-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK4-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 +// CHECK4-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] +// CHECK4-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK4-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK4-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK15-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK15-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK15-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK15: cond.true10: -// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END12:%.*]] -// CHECK15: cond.false11: -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END12]] -// CHECK15: cond.end12: -// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG14]] +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK4-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK4: cond.true10: +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END12:%.*]] +// CHECK4: cond.false11: +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END12]] +// CHECK4: cond.end12: +// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK15-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -2,21 +2,15 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK7 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK8 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK4 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK9 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK10 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK11 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK12 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK6 // expected-no-diagnostics #ifndef HEADER @@ -71,6397 +65,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK1-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* -// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] -// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK1: cond.true14: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END16:%.*]] -// CHECK1: cond.false15: -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END16]] -// CHECK1: cond.end16: -// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] -// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) -// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK1-NEXT: br i1 [[TMP54]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP55]], 0 -// CHECK1-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 -// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0 -// CHECK1-NEXT: br i1 [[TMP57]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP58]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 -// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK1-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK1: cond.true11: -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END13:%.*]] -// CHECK1: cond.false12: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END13]] -// CHECK1: cond.end13: -// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 -// CHECK1-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 -// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 -// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 -// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] -// CHECK1-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK1: cond.true5: -// CHECK1-NEXT: br label [[COND_END7:%.*]] -// CHECK1: cond.false6: -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END7]] -// CHECK1: cond.end7: -// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -// CHECK1-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK1-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK1: cond.true7: -// CHECK1-NEXT: br label [[COND_END9:%.*]] -// CHECK1: cond.false8: -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END9]] -// CHECK1: cond.end9: -// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK1-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* -// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* -// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK2-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] -// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK2: cond.true14: -// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END16:%.*]] -// CHECK2: cond.false15: -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END16]] -// CHECK2: cond.end16: -// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] -// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) -// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 -// CHECK2-NEXT: br i1 [[TMP51]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP52]], 0 -// CHECK2-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 -// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK2: omp.dispatch.cond: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK2: omp.dispatch.body: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK2: omp.dispatch.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK2: omp.dispatch.end: -// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 -// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK2-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK2: cond.true11: -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END13:%.*]] -// CHECK2: cond.false12: -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END13]] -// CHECK2: cond.end13: -// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 -// CHECK2-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 -// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK2-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 -// CHECK2-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK2: cond.true5: -// CHECK2-NEXT: br label [[COND_END7:%.*]] -// CHECK2: cond.false6: -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END7]] -// CHECK2: cond.end7: -// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -// CHECK2-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK2: cond.true7: -// CHECK2-NEXT: br label [[COND_END9:%.*]] -// CHECK2: cond.false8: -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END9]] -// CHECK2: cond.end9: -// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK3-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK3: cond.true11: -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END13:%.*]] -// CHECK3: cond.false12: -// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END13]] -// CHECK3: cond.end13: -// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK3-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 -// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK3-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK3: omp.dispatch.cond: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK3: omp.dispatch.body: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK3: omp.dispatch.inc: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK3: omp.dispatch.end: -// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK3-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK3-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK3-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK3: cond.true10: -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END12:%.*]] -// CHECK3: cond.false11: -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END12]] -// CHECK3: cond.end12: -// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK3-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK3-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK3-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK3-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK3: cond.true5: -// CHECK3-NEXT: br label [[COND_END7:%.*]] -// CHECK3: cond.false6: -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END7]] -// CHECK3: cond.end7: -// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK3-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK3: cond.true6: -// CHECK3-NEXT: br label [[COND_END8:%.*]] -// CHECK3: cond.false7: -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END8]] -// CHECK3: cond.end8: -// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK3-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK4-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK4: cond.true11: -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END13:%.*]] -// CHECK4: cond.false12: -// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END13]] -// CHECK4: cond.end13: -// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK4-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 -// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK4-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK4: omp.dispatch.cond: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK4: omp.dispatch.body: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK4: omp.dispatch.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK4: omp.dispatch.end: -// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK4-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK4-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK4-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK4-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK4: cond.true10: -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END12:%.*]] -// CHECK4: cond.false11: -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END12]] -// CHECK4: cond.end12: -// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK4-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK4-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK4-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK4-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK4-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK4: cond.true5: -// CHECK4-NEXT: br label [[COND_END7:%.*]] -// CHECK4: cond.false6: -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END7]] -// CHECK4: cond.end7: -// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK4-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK4: cond.true6: -// CHECK4-NEXT: br label [[COND_END8:%.*]] -// CHECK4: cond.false7: -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END8]] -// CHECK4: cond.end8: -// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK4-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK4-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK4: .omp.final.then: -// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK4: .omp.final.done: -// CHECK4-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK5-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK5-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK5-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK5-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK5: cond.true11: -// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END13:%.*]] -// CHECK5: cond.false12: -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END13]] -// CHECK5: cond.end13: -// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK5-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK5-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK5-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK5: omp.dispatch.cond: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK5: omp.dispatch.body: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK5: omp.dispatch.inc: -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK5: omp.dispatch.end: -// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK5-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK5-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK5-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK5-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK5-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK5: cond.true10: -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: br label [[COND_END12:%.*]] -// CHECK5: cond.false11: -// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END12]] -// CHECK5: cond.end12: -// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK5-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK5-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK5-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK5-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK5-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK5-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK5-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK5: cond.true5: -// CHECK5-NEXT: br label [[COND_END7:%.*]] -// CHECK5: cond.false6: -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END7]] -// CHECK5: cond.end7: -// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK5-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK5-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK5: cond.true6: -// CHECK5-NEXT: br label [[COND_END8:%.*]] -// CHECK5: cond.false7: -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END8]] -// CHECK5: cond.end8: -// CHECK5-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK5-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK5-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK5-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK5-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK5-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK5-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK5: .omp.final.then: -// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK5: .omp.final.done: -// CHECK5-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK6-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK6-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK6: cond.true11: -// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END13:%.*]] -// CHECK6: cond.false12: -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END13]] -// CHECK6: cond.end13: -// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK6-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK6-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK6-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK6: omp.dispatch.cond: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK6: omp.dispatch.body: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK6: omp.dispatch.inc: -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK6: omp.dispatch.end: -// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK6-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK6-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK6-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK6-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK6-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK6: cond.true10: -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: br label [[COND_END12:%.*]] -// CHECK6: cond.false11: -// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END12]] -// CHECK6: cond.end12: -// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK6-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK6-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK6-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK6-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK6-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK6-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK6-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK6: cond.true5: -// CHECK6-NEXT: br label [[COND_END7:%.*]] -// CHECK6: cond.false6: -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END7]] -// CHECK6: cond.end7: -// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK6-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK6-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK6: cond.true6: -// CHECK6-NEXT: br label [[COND_END8:%.*]] -// CHECK6: cond.false7: -// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END8]] -// CHECK6: cond.end8: -// CHECK6-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK6-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK6-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK6-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK6-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK6-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK6-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK6-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK6: .omp.final.then: -// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK6-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK6: .omp.final.done: -// CHECK6-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK7-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -6500,8 +103,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6673,8 +274,6 @@ // CHECK7-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6820,8 +419,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK7-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6852,8 +449,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6998,8 +593,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7105,8 +698,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7129,8 +720,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7231,8 +820,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7308,8 +895,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7340,8 +925,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7457,8 +1040,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7559,8 +1140,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK8-NEXT: entry: @@ -7599,8 +1178,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7767,8 +1344,6 @@ // CHECK8: omp.precond.end: // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7914,8 +1489,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK8-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7946,8 +1519,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8092,8 +1663,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8199,8 +1768,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8223,8 +1790,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8325,8 +1890,6 @@ // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8385,4470 +1948,10736 @@ // CHECK8: omp.body.continue: // CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK8-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK8: .omp.final.then: +// CHECK8-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK8: .omp.final.done: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK8: .execute: +// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK8: .omp.deinit: +// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK8: cond.true: +// CHECK8-NEXT: br label [[COND_END:%.*]] +// CHECK8: cond.false: +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END]] +// CHECK8: cond.end: +// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK8-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK8-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK8-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK8-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK8-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK8: cond.true7: +// CHECK8-NEXT: br label [[COND_END9:%.*]] +// CHECK8: cond.false8: +// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: br label [[COND_END9]] +// CHECK8: cond.end9: +// CHECK8-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK8-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK8: omp.inner.for.end: +// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK8: omp.loop.exit: +// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK8-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK8: .omp.final.then: +// CHECK8-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK8-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK8: .omp.final.done: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK8-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK8: omp.inner.for.cond: +// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK8: omp.inner.for.body: +// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK8-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK8-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK8-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK8-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK8-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK8-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK8-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK8: omp.body.continue: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK8: omp.inner.for.inc: +// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] // CHECK8: omp.inner.for.end: // CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK8: omp.loop.exit: // CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK8-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK8-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK8-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] // CHECK8: .omp.final.then: // CHECK8-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK8-NEXT: store i32 10, i32* [[J]], align 4 // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK9-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK9-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK9-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK9-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK9-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK9-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK9-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK9-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK9: cond.true11: +// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK9-NEXT: br label [[COND_END13:%.*]] +// CHECK9: cond.false12: +// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END13]] +// CHECK9: cond.end13: +// CHECK9-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK9-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK9-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 +// CHECK9-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 +// CHECK9-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK9: omp.dispatch.cond: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK9: omp.dispatch.body: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK9-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK9: omp.dispatch.inc: +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK9: omp.dispatch.end: +// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK9-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK9-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK9-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK9-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK9-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK9: .omp.lastprivate.then: +// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK9: .omp.lastprivate.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK9-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK9-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK9-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK9-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK9-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK9: cond.true10: +// CHECK9-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END12:%.*]] +// CHECK9: cond.false11: +// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END12]] +// CHECK9: cond.end12: +// CHECK9-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK9-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK9-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK9-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK9-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK9-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK9-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK9-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK9-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK9-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK9: cond.true5: +// CHECK9-NEXT: br label [[COND_END7:%.*]] +// CHECK9: cond.false6: +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END7]] +// CHECK9: cond.end7: +// CHECK9-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK9-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK9-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK9-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK9-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK9-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK9: .execute: +// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK9-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK9: .omp.deinit: +// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK9-NEXT: br label [[DOTEXIT:%.*]] +// CHECK9: .exit: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK9-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK9-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK9-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK9-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK9: cond.true6: +// CHECK9-NEXT: br label [[COND_END8:%.*]] +// CHECK9: cond.false7: +// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END8]] +// CHECK9: cond.end8: +// CHECK9-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK9-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK9-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK9-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: ret void +// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK9-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK9-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK9-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK9-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK9-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK9-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK9-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK9-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK10-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK10-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK10-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK10-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK10-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK10-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK10-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK10-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK10-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK10-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK10-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK10: cond.true11: +// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK10-NEXT: br label [[COND_END13:%.*]] +// CHECK10: cond.false12: +// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END13]] +// CHECK10: cond.end13: +// CHECK10-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK10-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK10-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 +// CHECK10-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 +// CHECK10-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK10-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK10: omp.dispatch.cond: +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK10: omp.dispatch.body: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK10-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK10-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK10: omp.dispatch.inc: +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK10: omp.dispatch.end: +// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK10-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK10-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK10-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK10-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK10-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK10: .omp.lastprivate.then: +// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK10: .omp.lastprivate.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK10-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK10-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK10-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK10-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK10-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK10-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK10: cond.true10: +// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: br label [[COND_END12:%.*]] +// CHECK10: cond.false11: +// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END12]] +// CHECK10: cond.end12: +// CHECK10-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK10-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK10-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK10-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK10: omp.precond.then: +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK10-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK10-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK10-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK10-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK10-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK10-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK10-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK10-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK10-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: br label [[OMP_PRECOND_END]] +// CHECK10: omp.precond.end: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK10-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK10-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK10: cond.true5: +// CHECK10-NEXT: br label [[COND_END7:%.*]] +// CHECK10: cond.false6: +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END7]] +// CHECK10: cond.end7: +// CHECK10-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK10-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK10-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK10-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK10-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK10-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK10: .execute: +// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK10-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK10: .omp.deinit: +// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK10-NEXT: br label [[DOTEXIT:%.*]] +// CHECK10: .exit: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK10: cond.true: +// CHECK10-NEXT: br label [[COND_END:%.*]] +// CHECK10: cond.false: +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END]] +// CHECK10: cond.end: +// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK10-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK10-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK10-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK10-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK10-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK10-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK10-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK10-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK10-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK10: cond.true6: +// CHECK10-NEXT: br label [[COND_END8:%.*]] +// CHECK10: cond.false7: +// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: br label [[COND_END8]] +// CHECK10: cond.end8: +// CHECK10-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK10-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK10-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK10-NEXT: entry: +// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK10: omp.inner.for.cond: +// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK10: omp.inner.for.body: +// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK10-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK10-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK10-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK10-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK10-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK10-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK10: omp.body.continue: +// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK10: omp.inner.for.inc: +// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK10: omp.inner.for.end: +// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK10: omp.loop.exit: +// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK10: .omp.final.then: +// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK10: .omp.final.done: +// CHECK10-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK11-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK11: cond.true11: +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END13:%.*]] +// CHECK11: cond.false12: +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END13]] +// CHECK11: cond.end13: +// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK11-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK11-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK11-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK11: omp.dispatch.cond: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK11: omp.dispatch.body: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK11: omp.dispatch.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK11: omp.dispatch.end: +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK11-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK11-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK11-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK11-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK11-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK11-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK11: cond.true10: +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END12:%.*]] +// CHECK11: cond.false11: +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END12]] +// CHECK11: cond.end12: +// CHECK11-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK11-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK11-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK11-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK11-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK11-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK11-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK11-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK11-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK11: cond.true5: +// CHECK11-NEXT: br label [[COND_END7:%.*]] +// CHECK11: cond.false6: +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END7]] +// CHECK11: cond.end7: +// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK11-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK11-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK11: cond.true6: +// CHECK11-NEXT: br label [[COND_END8:%.*]] +// CHECK11: cond.false7: +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END8]] +// CHECK11: cond.end8: +// CHECK11-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK11-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK11-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK11-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK11-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK12-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK12-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK12-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK12: cond.true11: +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END13:%.*]] +// CHECK12: cond.false12: +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END13]] +// CHECK12: cond.end13: +// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK12-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK12: omp.dispatch.cond: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK12: omp.dispatch.body: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK12: omp.dispatch.inc: +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK12: omp.dispatch.end: +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK12-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK12-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK12-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK12-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK12: cond.true10: +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: br label [[COND_END12:%.*]] +// CHECK12: cond.false11: +// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END12]] +// CHECK12: cond.end12: +// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK12-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK12-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK12-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK12-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK12-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK12-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK12: cond.true5: +// CHECK12-NEXT: br label [[COND_END7:%.*]] +// CHECK12: cond.false6: +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END7]] +// CHECK12: cond.end7: +// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK12-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK12-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK12: cond.true6: +// CHECK12-NEXT: br label [[COND_END8:%.*]] +// CHECK12: cond.false7: +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END8]] +// CHECK12: cond.end8: +// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK12-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK8-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK8: .execute: -// CHECK8-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK8-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK8-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK8: .omp.deinit: -// CHECK8-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void -// -// -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK8-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK8-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK8: cond.true: -// CHECK8-NEXT: br label [[COND_END:%.*]] -// CHECK8: cond.false: -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END]] -// CHECK8: cond.end: -// CHECK8-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK8-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK8-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK8-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK8-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK8-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK8-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK8-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK8-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK8-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK8-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK8-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK8-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK8-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK8-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK8-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK8-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK8-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK8-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK8-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK8-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK8: cond.true7: -// CHECK8-NEXT: br label [[COND_END9:%.*]] -// CHECK8: cond.false8: -// CHECK8-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: br label [[COND_END9]] -// CHECK8: cond.end9: -// CHECK8-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK8-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK8-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK8-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK8-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK8: .omp.final.then: -// CHECK8-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK8-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK8: .omp.final.done: -// CHECK8-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK1-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK1-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK1-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP53]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK1-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK8-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK8-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK8-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK8-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK8-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK8-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK8-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK8-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK8-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK8-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK8-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK8: omp.inner.for.cond: -// CHECK8-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK8-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK8-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK8-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK8: omp.inner.for.body: -// CHECK8-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK8-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK8-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK8-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK8-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK8-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK8-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK8-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK8-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK8-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK8-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK8-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK8-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK8-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK8-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK8-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK8-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK8-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK8-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK8-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK8-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK8-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK8-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK8: omp.body.continue: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK8: omp.inner.for.inc: -// CHECK8-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK8-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK8-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK8: omp.inner.for.end: -// CHECK8-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK8: omp.loop.exit: -// CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK8-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK8-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK8-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK8: .omp.final.then: -// CHECK8-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK8-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK8: .omp.final.done: -// CHECK8-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK1-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK9: .execute: -// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK9-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK9: .omp.deinit: -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK9-NEXT: br label [[DOTEXIT:%.*]] -// CHECK9: .exit: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 +// CHECK1-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 +// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK9-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK9-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK9-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK9-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK9-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK9-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK9-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK9-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK9-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK9-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK9-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK9-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK9-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK9-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK9-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK9: cond.true11: -// CHECK9-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK9-NEXT: br label [[COND_END13:%.*]] -// CHECK9: cond.false12: -// CHECK9-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END13]] -// CHECK9: cond.end13: -// CHECK9-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK9-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK9-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK9-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 -// CHECK9-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK9-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK9-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK9: .omp.lastprivate.then: -// CHECK9-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 +// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK9-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK9-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK9: omp.dispatch.cond: -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK9: omp.dispatch.body: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK9-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK9-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK9: omp.body.continue: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK9: omp.dispatch.inc: -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK9-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK9: omp.dispatch.end: -// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK9-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK9-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK9-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK9-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK9-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK9-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK9: .omp.lastprivate.then: -// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK9-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK9: .execute: -// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK9-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK9: .omp.deinit: -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK9-NEXT: br label [[DOTEXIT:%.*]] -// CHECK9: .exit: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK1: cond.true5: +// CHECK1-NEXT: br label [[COND_END7:%.*]] +// CHECK1: cond.false6: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END7]] +// CHECK1: cond.end7: +// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK1-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK9-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK9-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK9-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK9-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK9-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK9-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK9-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK9-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK9-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK9-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK9-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK9: cond.true10: -// CHECK9-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: br label [[COND_END12:%.*]] -// CHECK9: cond.false11: -// CHECK9-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END12]] -// CHECK9: cond.end12: -// CHECK9-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK9-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK9-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK9-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK9-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK9-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK1-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK9-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK9: omp.precond.then: -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK9-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK9-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK9-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK9-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK9-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK9: omp.body.continue: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK9-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK9-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: br label [[OMP_PRECOND_END]] -// CHECK9: omp.precond.end: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK1: cond.true7: +// CHECK1-NEXT: br label [[COND_END9:%.*]] +// CHECK1: cond.false8: +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END9]] +// CHECK1: cond.end9: +// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK9: .execute: -// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK9-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK9: .omp.deinit: -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK9-NEXT: br label [[DOTEXIT:%.*]] -// CHECK9: .exit: -// CHECK9-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK9-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK9-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK9-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK9: cond.true5: -// CHECK9-NEXT: br label [[COND_END7:%.*]] -// CHECK9: cond.false6: -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END7]] -// CHECK9: cond.end7: -// CHECK9-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK9-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK9-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK2-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK2-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK9-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK9-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK9: omp.body.continue: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK9-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK9-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK2-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK9-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK9: .execute: -// CHECK9-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK9-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK9: .omp.deinit: -// CHECK9-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK9-NEXT: br label [[DOTEXIT:%.*]] -// CHECK9: .exit: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK9-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK9-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK9: cond.true: -// CHECK9-NEXT: br label [[COND_END:%.*]] -// CHECK9: cond.false: -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END]] -// CHECK9: cond.end: -// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK9-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK9-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK9-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK9-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK9-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK9-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK9-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK9-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK9-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK9-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK9: cond.true6: -// CHECK9-NEXT: br label [[COND_END8:%.*]] -// CHECK9: cond.false7: -// CHECK9-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: br label [[COND_END8]] -// CHECK9: cond.end8: -// CHECK9-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK9-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK9-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK9-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK9-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK9-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK2-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK2: cond.true10: +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END12:%.*]] +// CHECK2: cond.false11: +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END12]] +// CHECK2: cond.end12: +// CHECK2-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK2-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK2-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK2-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK2-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK9-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK9-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK9-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK9-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK9-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK9-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK9-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK9: omp.inner.for.cond: -// CHECK9-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK9-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK9: omp.inner.for.body: -// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK9-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK9-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK9-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK9-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK9-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK9-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK9-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK9-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK9-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK9-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK9-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK9-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK9: omp.body.continue: -// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK9: omp.inner.for.inc: -// CHECK9-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK9-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK9: omp.inner.for.end: -// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK9: omp.loop.exit: -// CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK9: .omp.final.then: -// CHECK9-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK9-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK9: .omp.final.done: -// CHECK9-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK10: .execute: -// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK10-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK10: .omp.deinit: -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK10-NEXT: br label [[DOTEXIT:%.*]] -// CHECK10: .exit: -// CHECK10-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK2: cond.true5: +// CHECK2-NEXT: br label [[COND_END7:%.*]] +// CHECK2: cond.false6: +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END7]] +// CHECK2: cond.end7: +// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK2-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK10-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK10-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK10-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK10-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK10-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK10-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK10-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK10-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK10-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK10-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK10-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK10-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK10-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK10-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK10-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK10-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK10-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK10-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK10-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK10-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK10-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK10-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK10: cond.true11: -// CHECK10-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK10-NEXT: br label [[COND_END13:%.*]] -// CHECK10: cond.false12: -// CHECK10-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END13]] -// CHECK10: cond.end13: -// CHECK10-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK10-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK10-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK10-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 -// CHECK10-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK10-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK10-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK10: .omp.lastprivate.then: -// CHECK10-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK10: .omp.lastprivate.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) -// CHECK10-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK2: cond.true6: +// CHECK2-NEXT: br label [[COND_END8:%.*]] +// CHECK2: cond.false7: +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END8]] +// CHECK2: cond.end8: +// CHECK2-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK2-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK2-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK2-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK10-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK10-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK10: omp.dispatch.cond: -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK10: omp.dispatch.body: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK10-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK10-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK10: omp.body.continue: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK10: omp.dispatch.inc: -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK10-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK10: omp.dispatch.end: -// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK10-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK10-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK10-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK10-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK10-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK10-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK10: .omp.lastprivate.then: -// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK10: .omp.lastprivate.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK3-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK3-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK10-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK10: .execute: -// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK10-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK10: .omp.deinit: -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK10-NEXT: br label [[DOTEXIT:%.*]] -// CHECK10: .exit: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK3-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK3-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK3-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK10-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK10-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK10-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK10-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK10-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK10-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK10-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK10-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK10-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK10-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK10-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK10-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK10-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK10-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK10-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK10: cond.true10: -// CHECK10-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: br label [[COND_END12:%.*]] -// CHECK10: cond.false11: -// CHECK10-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END12]] -// CHECK10: cond.end12: -// CHECK10-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK10-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK10-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK10-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK10-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK10-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK3-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK10-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK10: omp.precond.then: -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK10-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK10-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK10-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK10-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK10-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK10-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK10: omp.body.continue: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK10-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK10-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK10-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK10-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK10-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK10-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: br label [[OMP_PRECOND_END]] -// CHECK10: omp.precond.end: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK3-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK3-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK10: .execute: -// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK10-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK10: .omp.deinit: -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK10-NEXT: br label [[DOTEXIT:%.*]] -// CHECK10: .exit: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK10-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK10-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK10-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK10-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK10: cond.true5: -// CHECK10-NEXT: br label [[COND_END7:%.*]] -// CHECK10: cond.false6: -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END7]] -// CHECK10: cond.end7: -// CHECK10-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK10-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK10-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK3: cond.true5: +// CHECK3-NEXT: br label [[COND_END7:%.*]] +// CHECK3: cond.false6: +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END7]] +// CHECK3: cond.end7: +// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK3-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK10-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK10-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK10: omp.body.continue: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK10-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK10-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK3: cond.true6: +// CHECK3-NEXT: br label [[COND_END8:%.*]] +// CHECK3: cond.false7: +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END8]] +// CHECK3: cond.end8: +// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK3-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK10-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK10: .execute: -// CHECK10-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK10-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK10-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK10: .omp.deinit: -// CHECK10-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK10-NEXT: br label [[DOTEXIT:%.*]] -// CHECK10: .exit: -// CHECK10-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK4-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK10-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK10-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK10-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK10: cond.true: -// CHECK10-NEXT: br label [[COND_END:%.*]] -// CHECK10: cond.false: -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END]] -// CHECK10: cond.end: -// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK10-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK10-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK10-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK10-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK10-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK10-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK10-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK10-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK10-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK10-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK10-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK10-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK10-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK10-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK10-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK10-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK10: cond.true6: -// CHECK10-NEXT: br label [[COND_END8:%.*]] -// CHECK10: cond.false7: -// CHECK10-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: br label [[COND_END8]] -// CHECK10: cond.end8: -// CHECK10-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK10-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK10-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK10-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK10-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK4-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK4-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK4-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK4-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK4-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK4-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK4-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK4-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK4: cond.true14: +// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK4-NEXT: br label [[COND_END16:%.*]] +// CHECK4: cond.false15: +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END16]] +// CHECK4: cond.end16: +// CHECK4-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK4-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK4-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK4-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 +// CHECK4-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK4-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK4-NEXT: store i32 [[TMP53]], i32* [[CONV1]], align 8 +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK4-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK10-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK10-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK10-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK10-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK10-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK10-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK10-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK10-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK10: omp.inner.for.cond: -// CHECK10-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK10-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK10: omp.inner.for.body: -// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK10-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK10-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK10-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK10-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK10-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK10-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK10-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK10-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK10-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK10-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK10-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK10-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK10-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK10-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK10-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK10: omp.body.continue: -// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK10: omp.inner.for.inc: -// CHECK10-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK10-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK10-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK10: omp.inner.for.end: -// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK10: omp.loop.exit: -// CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK10-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK10-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK10-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK10: .omp.final.then: -// CHECK10-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK10-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK10: .omp.final.done: -// CHECK10-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK4-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK4: omp.dispatch.cond: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK4-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK4-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK4-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK4: omp.dispatch.body: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK4-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK4-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK4: omp.dispatch.inc: +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK4: omp.dispatch.end: +// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK4-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 +// CHECK4-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK4-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK4-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK4-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK4-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK4-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK11-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK11-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK11: cond.true11: -// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END13:%.*]] -// CHECK11: cond.false12: -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END13]] -// CHECK11: cond.end13: -// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK11-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK11-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK11-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK4-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK4: cond.true11: +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END13:%.*]] +// CHECK4: cond.false12: +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END13]] +// CHECK4: cond.end13: +// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 +// CHECK4-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 +// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK4-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK4-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK4-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK4-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 +// CHECK4-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 +// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] +// CHECK4-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK11: omp.dispatch.cond: -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK11: omp.dispatch.body: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK11: omp.dispatch.inc: -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK11: omp.dispatch.end: -// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK11-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK11-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK11-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK11-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK4-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK4: cond.true5: +// CHECK4-NEXT: br label [[COND_END7:%.*]] +// CHECK4: cond.false6: +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END7]] +// CHECK4: cond.end7: +// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK4-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK4-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK4-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK11-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK11-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK11-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK11: cond.true10: -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: br label [[COND_END12:%.*]] -// CHECK11: cond.false11: -// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END12]] -// CHECK11: cond.end12: -// CHECK11-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK11-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK11-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK4-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK4-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK4: cond.true7: +// CHECK4-NEXT: br label [[COND_END9:%.*]] +// CHECK4: cond.false8: +// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END9]] +// CHECK4: cond.end9: +// CHECK4-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK4-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK4-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK4-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK11-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK11-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK11-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK11-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK11-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK4-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK4-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK4-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK4-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK4-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK4-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK4-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK4: .omp.final.then: +// CHECK4-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK4-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK4: .omp.final.done: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK11-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK11: cond.true5: -// CHECK11-NEXT: br label [[COND_END7:%.*]] -// CHECK11: cond.false6: -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END7]] -// CHECK11: cond.end7: -// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK11-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK5-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK5-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK5-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK5: cond.true11: +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END13:%.*]] +// CHECK5: cond.false12: +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END13]] +// CHECK5: cond.end13: +// CHECK5-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK5-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK5-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK5-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK5-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK5-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK5: omp.dispatch.cond: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK5: omp.dispatch.body: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK5-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK5: omp.dispatch.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK5: omp.dispatch.end: +// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK5-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK5-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK5-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK5-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK5-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK11-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK5-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK5-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK5-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK5-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK5: cond.true10: +// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: br label [[COND_END12:%.*]] +// CHECK5: cond.false11: +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END12]] +// CHECK5: cond.end12: +// CHECK5-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK5-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK5-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK5-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK11: cond.true6: -// CHECK11-NEXT: br label [[COND_END8:%.*]] -// CHECK11: cond.false7: -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END8]] -// CHECK11: cond.end8: -// CHECK11-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK11-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK11-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK5-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK5-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK5-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK5-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK5-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK5-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK11-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK11-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK11-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK5-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK5: cond.true5: +// CHECK5-NEXT: br label [[COND_END7:%.*]] +// CHECK5: cond.false6: +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END7]] +// CHECK5: cond.end7: +// CHECK5-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK5-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK5-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK5-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK5-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK5-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK5-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK5: cond.true6: +// CHECK5-NEXT: br label [[COND_END8:%.*]] +// CHECK5: cond.false7: +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END8]] +// CHECK5: cond.end8: +// CHECK5-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK5-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK5-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK12-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK12-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK12-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK12-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK12: cond.true11: -// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END13:%.*]] -// CHECK12: cond.false12: -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END13]] -// CHECK12: cond.end13: -// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK12-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK12-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK5-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK5-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK5-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK5-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK5-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK5-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK5: .omp.final.then: +// CHECK5-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK5-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK5: .omp.final.done: +// CHECK5-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK12: omp.dispatch.cond: -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK12: omp.dispatch.body: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK12: omp.dispatch.inc: -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK12: omp.dispatch.end: -// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK12-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK12-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK12-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK12-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10:![0-9]+]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK6-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK6-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK6: cond.true11: +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END13:%.*]] +// CHECK6: cond.false12: +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END13]] +// CHECK6: cond.end13: +// CHECK6-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK6-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK6-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK6-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK6-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK12: cond.true10: -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: br label [[COND_END12:%.*]] -// CHECK12: cond.false11: -// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END12]] -// CHECK12: cond.end12: -// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK12-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK12-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK6-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK6: omp.dispatch.cond: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK6: omp.dispatch.body: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK6: omp.dispatch.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK6: omp.dispatch.end: +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK6-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK6-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK6-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK6-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK6-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK12-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK12-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK12-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK12-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK12-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK12-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 +// CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK6-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK6-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK6-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK6-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK6: cond.true10: +// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: br label [[COND_END12:%.*]] +// CHECK6: cond.false11: +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END12]] +// CHECK6: cond.end12: +// CHECK6-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK6-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK6-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK6-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK12: cond.true5: -// CHECK12-NEXT: br label [[COND_END7:%.*]] -// CHECK12: cond.false6: -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END7]] -// CHECK12: cond.end7: -// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK12-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK6-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK6-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK6-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK6-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK6-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK6-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK6-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK12-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK6-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK6: cond.true5: +// CHECK6-NEXT: br label [[COND_END7:%.*]] +// CHECK6: cond.false6: +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END7]] +// CHECK6: cond.end7: +// CHECK6-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK6-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK6-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK6-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l48 +// CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK12: cond.true6: -// CHECK12-NEXT: br label [[COND_END8:%.*]] -// CHECK12: cond.false7: -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END8]] -// CHECK12: cond.end8: -// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK12-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG10]] +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK6-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK6-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK6-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK6-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK6: cond.true6: +// CHECK6-NEXT: br label [[COND_END8:%.*]] +// CHECK6: cond.false7: +// CHECK6-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END8]] +// CHECK6: cond.end8: +// CHECK6-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK6-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK6-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK6-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK6-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK6-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK6-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK6-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK6-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK6-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK6-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK6-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK6-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK6: .omp.final.then: +// CHECK6-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK6-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK6: .omp.final.done: +// CHECK6-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -1,10 +1,8 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -29,31 +27,18 @@ return tmain(argv); } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // only nvptx side: do not outline teams region and do not call fork_teams // CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, // CK1: store {{.+}} 0, {{.+}}, -// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] -// CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK1: store i{{32|64}} [[ARGC]], i{{32|64}}* [[ARGCADDR]], +// CK1-64: [[CONV:%.+]] = bitcast i{{32|64}}* [[ARGCADDR]] to i{{32|64}}* +// CK1-64: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[CONV]] +// CK1-32: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[ARGCADDR]] +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK1: store i{{32|64}} [[ARG]], i{{32|64}}* [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -65,16 +50,11 @@ // CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -87,11 +67,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 // expected-no-diagnostics #ifdef CK2 @@ -118,14 +96,6 @@ return tmain(argv); } -// SEQ2: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, @@ -136,18 +106,13 @@ // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK2: ret // CK2: define internal void [[OUTLINED]]( @@ -162,17 +127,12 @@ // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK2: ret void // CK2: define internal void [[OUTLINED]]( diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -2,12 +2,10 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK3 + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -53,4354 +51,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[CONV]], align 8 -// CHECK1-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store double [[TMP10]], double* [[E7]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8 -// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.0* -// CHECK1-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP3]], i32 0, i32 0 -// CHECK1-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load double, double* [[E1]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP4]], 5.000000e+00 -// CHECK1-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[E1]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 -// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP13:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load double, double* [[E1]], align 8 -// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP13]], [[TMP14]] -// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 -// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK1-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK1-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND:%.*]] -// CHECK1: precond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK1: body: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND]] -// CHECK1: exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.2* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 8 -// CHECK1-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 1 -// CHECK1-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 8 -// CHECK1-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store float [[TMP11]], float* [[D9]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 -// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK1-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK1-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 -// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 -// CHECK1-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* -// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) -// CHECK1-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 -// CHECK1-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK1-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK1-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 -// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] -// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK1-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 -// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK1-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 -// CHECK1-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK1-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK1-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK1-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK1-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK1-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: -// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 -// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 -// CHECK1-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK1-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 -// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 -// CHECK1-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK1-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK1-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK1-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: -// CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK1-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK1-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK1-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK1: then8: -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK1-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK1-NEXT: br label [[IFCONT10:%.*]] -// CHECK1: else9: -// CHECK1-NEXT: br label [[IFCONT10]] -// CHECK1: ifcont10: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK1: then12: -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK1-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK1-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK1-NEXT: br label [[IFCONT14:%.*]] -// CHECK1: else13: -// CHECK1-NEXT: br label [[IFCONT14]] -// CHECK1: ifcont14: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 -// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i64 2) -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) -// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__12 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 -// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) -// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK1: cond.true9: -// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK1-NEXT: br label [[COND_END11:%.*]] -// CHECK1: cond.false10: -// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK1-NEXT: br label [[COND_END11]] -// CHECK1: cond.end11: -// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 -// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 -// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 -// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 -// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 -// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: -// CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK1: then8: -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK1-NEXT: br label [[IFCONT10:%.*]] -// CHECK1: else9: -// CHECK1-NEXT: br label [[IFCONT10]] -// CHECK1: ifcont10: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK1: then12: -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK1-NEXT: br label [[IFCONT14:%.*]] -// CHECK1: else13: -// CHECK1-NEXT: br label [[IFCONT14]] -// CHECK1: ifcont14: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 -// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK1: then6: -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 -// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 -// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 -// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 -// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK1-NEXT: br label [[IFCONT8:%.*]] -// CHECK1: else7: -// CHECK1-NEXT: br label [[IFCONT8]] -// CHECK1: ifcont8: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK1: then8: -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK1-NEXT: br label [[IFCONT10:%.*]] -// CHECK1: else9: -// CHECK1-NEXT: br label [[IFCONT10]] -// CHECK1: ifcont10: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK1: then12: -// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK1-NEXT: br label [[IFCONT14:%.*]] -// CHECK1: else13: -// CHECK1-NEXT: br label [[IFCONT14]] -// CHECK1: ifcont14: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK2-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[TMP7:%.*]] = load double, double* [[CONV]], align 8 -// CHECK2-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP2]], i32 0, i32 0 -// CHECK2-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 -// CHECK2-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* -// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK2-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 -// CHECK2-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 -// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] -// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 -// CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK2-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK2-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: br label [[PRECOND:%.*]] -// CHECK2: precond: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK2: body: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: br label [[PRECOND]] -// CHECK2: exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK2-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.2* -// CHECK2-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 8 -// CHECK2-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 8 -// CHECK2-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store float [[TMP8]], float* [[D9]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 -// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.3* -// CHECK2-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 1 -// CHECK2-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 0 -// CHECK2-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK2-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 -// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK2-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK2-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 -// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 -// CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK2-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK2-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK2-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK2-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: -// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 -// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 -// CHECK2-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK2-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 -// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 -// CHECK2-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK2-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK2-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: -// CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK2-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK2-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK2-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK2: then8: -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK2-NEXT: br label [[IFCONT10:%.*]] -// CHECK2: else9: -// CHECK2-NEXT: br label [[IFCONT10]] -// CHECK2: ifcont10: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK2: then12: -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK2-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK2-NEXT: br label [[IFCONT14:%.*]] -// CHECK2: else13: -// CHECK2-NEXT: br label [[IFCONT14]] -// CHECK2: ifcont14: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK2-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 -// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2) -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) -// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 -// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) -// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK2: cond.true9: -// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK2-NEXT: br label [[COND_END11:%.*]] -// CHECK2: cond.false10: -// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK2-NEXT: br label [[COND_END11]] -// CHECK2: cond.end11: -// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: -// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 -// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 -// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 -// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 -// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: -// CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK2: then8: -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK2-NEXT: br label [[IFCONT10:%.*]] -// CHECK2: else9: -// CHECK2-NEXT: br label [[IFCONT10]] -// CHECK2: ifcont10: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK2: then12: -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK2-NEXT: br label [[IFCONT14:%.*]] -// CHECK2: else13: -// CHECK2-NEXT: br label [[IFCONT14]] -// CHECK2: ifcont14: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 -// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: -// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 -// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 -// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 -// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 -// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: -// CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK2: then8: -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK2-NEXT: br label [[IFCONT10:%.*]] -// CHECK2: else9: -// CHECK2-NEXT: br label [[IFCONT10]] -// CHECK2: ifcont10: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK2: then12: -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK2-NEXT: br label [[IFCONT14:%.*]] -// CHECK2: else13: -// CHECK2-NEXT: br label [[IFCONT14]] -// CHECK2: ifcont14: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK3-NEXT: [[E7:%.*]] = alloca double, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK3-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK3-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8 -// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00 -// CHECK3-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK3-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK3-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8 -// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]] -// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 -// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK3-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK3-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK3-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK3-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK3-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK3-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK3-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: br label [[PRECOND:%.*]] -// CHECK3: precond: -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK3: body: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: br label [[PRECOND]] -// CHECK3: exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK3-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1 -// CHECK3-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK3-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0 -// CHECK3-NEXT: store float [[TMP11]], float* [[D9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2* -// CHECK3-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK3-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 -// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 -// CHECK3-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) -// CHECK3-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 -// CHECK3-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK3-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 -// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] -// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK3-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK3-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK3-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK3-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK3-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK3-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK3-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK3-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK3-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK3-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK3-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK3-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK3-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK3-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK3-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK3-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK3-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK3-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) -// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__12 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) -// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK3: cond.true9: -// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: br label [[COND_END11:%.*]] -// CHECK3: cond.false10: -// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: br label [[COND_END11]] -// CHECK3: cond.end11: -// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -4422,7 +72,351 @@ // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK4: .execute.parallel: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK4-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK4-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK4-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK4-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8 +// CHECK4-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00 +// CHECK4-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4: .omp.reduction.then: +// CHECK4-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8 +// CHECK4-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]] +// CHECK4-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) +// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK4: .omp.reduction.done: +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK4-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK4-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK4-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK4-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK4-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK4-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK4-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK4-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK4-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK4-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK4-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK4-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK4-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: br label [[PRECOND:%.*]] +// CHECK4: precond: +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK4-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK4: body: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK4-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK4-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK4-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK4-NEXT: br label [[PRECOND]] +// CHECK4: exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* // CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] @@ -4434,48 +428,60 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: -// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK4-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] // CHECK4: .worker: -// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] // CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK4-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1* +// CHECK4-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1 +// CHECK4-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK4-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0 +// CHECK4-NEXT: store float [[TMP11]], float* [[D9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK4: .termination.notifier: // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -4483,135 +489,163 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK4-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8 -// CHECK4-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00 -// CHECK4-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK4-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK4-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2* +// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK4-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 +// CHECK4-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK4-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 +// CHECK4-NEXT: store float [[MUL]], float* [[D2]], align 4 // CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) +// CHECK4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +// CHECK4-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK4: .omp.reduction.then: -// CHECK4-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK4-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8 -// CHECK4-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]] -// CHECK4-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK4-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK4-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK4-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 +// CHECK4-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK4-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK4-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK4-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 +// CHECK4-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] +// CHECK4-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 // CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) // CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK4: .omp.reduction.done: -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 // CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 // CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 // CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 // CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 // CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 // CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* // CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 // CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 // CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 // CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK4-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK4-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK4-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK4-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 -// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 -// CHECK4-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK4-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK4-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK4-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK4-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK4-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK4-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK4-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK4-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK4-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK4: then: -// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK4-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK4-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK4-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK4-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK4-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK4-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK4-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK4-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK4-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] // CHECK4-NEXT: br label [[IFCONT:%.*]] // CHECK4: else: // CHECK4-NEXT: br label [[IFCONT]] // CHECK4: ifcont: -// CHECK4-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK4-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK4: then4: -// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 -// CHECK4-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK4-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK4-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK4-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK4-NEXT: br label [[IFCONT6:%.*]] -// CHECK4: else5: -// CHECK4-NEXT: br label [[IFCONT6]] -// CHECK4: ifcont6: +// CHECK4-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK4-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK4: then6: +// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK4-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK4-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK4-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK4-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK4-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK4-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK4-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK4-NEXT: br label [[IFCONT8:%.*]] +// CHECK4: else7: +// CHECK4-NEXT: br label [[IFCONT8]] +// CHECK4: ifcont8: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 @@ -4621,53 +655,67 @@ // CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK4-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK4-NEXT: br label [[PRECOND:%.*]] -// CHECK4: precond: -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK4-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK4: body: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) // CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK4: then: -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK4-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK4-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 // CHECK4-NEXT: br label [[IFCONT:%.*]] // CHECK4: else: // CHECK4-NEXT: br label [[IFCONT]] // CHECK4: ifcont: // CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] // CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK4: then4: -// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK4-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK4-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK4-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 // CHECK4-NEXT: br label [[IFCONT6:%.*]] // CHECK4: else5: // CHECK4-NEXT: br label [[IFCONT6]] // CHECK4: ifcont6: -// CHECK4-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK4-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK4-NEXT: br label [[PRECOND]] -// CHECK4: exit: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK4: then8: +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK4-NEXT: br label [[IFCONT10:%.*]] +// CHECK4: else9: +// CHECK4-NEXT: br label [[IFCONT10]] +// CHECK4: ifcont10: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK4: then12: +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK4-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK4-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK4-NEXT: br label [[IFCONT14:%.*]] +// CHECK4: else13: +// CHECK4-NEXT: br label [[IFCONT14]] +// CHECK4: ifcont14: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -4677,45 +725,51 @@ // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 // CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* // CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -4725,202 +779,211 @@ // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 // CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK4-SAME: () #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK4: .await.work: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK4: .select.workers: -// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK4: .execute.parallel: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK4: .terminate.parallel: -// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK4: .barrier.parallel: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK4: .exit: +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK4: .worker: -// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1* -// CHECK4-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1 -// CHECK4-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK4-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0 -// CHECK4-NEXT: store float [[TMP11]], float* [[D9]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK4-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) -// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK4: .termination.notifier: -// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 // CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK4-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2* -// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK4-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK4-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 -// CHECK4-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK4-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK4-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 -// CHECK4-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2) // CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* -// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) -// CHECK4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 -// CHECK4-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) +// CHECK4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK4-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK4: .omp.reduction.then: -// CHECK4-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK4-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK4-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK4-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -// CHECK4-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK4-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK4-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK4-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 -// CHECK4-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] -// CHECK4-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK4-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK4: .omp.reduction.done: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK4-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK4-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK4-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) +// CHECK4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK4-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK4: .omp.reduction.then: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK4-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK4-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK4: cond.true9: +// CHECK4-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK4-NEXT: br label [[COND_END11:%.*]] +// CHECK4: cond.false10: +// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK4-NEXT: br label [[COND_END11]] +// CHECK4: cond.end11: +// CHECK4-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK4-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) // CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK4: .omp.reduction.done: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -4928,8 +991,8 @@ // CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 // CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 // CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 // CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 // CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 @@ -4942,87 +1005,262 @@ // CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 // CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 // CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK4-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK4-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK4-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK4-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK4-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK4-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK4-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 // CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK4-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK4-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK4-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK4-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK4-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK4-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK4-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK4-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK4-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK4: then6: +// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK4-NEXT: br label [[IFCONT8:%.*]] +// CHECK4: else7: +// CHECK4-NEXT: br label [[IFCONT8]] +// CHECK4: ifcont8: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK4: then8: +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK4-NEXT: br label [[IFCONT10:%.*]] +// CHECK4: else9: +// CHECK4-NEXT: br label [[IFCONT10]] +// CHECK4: ifcont10: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK4: then12: +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK4-NEXT: br label [[IFCONT14:%.*]] +// CHECK4: else13: +// CHECK4-NEXT: br label [[IFCONT14]] +// CHECK4: ifcont14: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK4: then: -// CHECK4-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] // CHECK4-NEXT: br label [[IFCONT:%.*]] // CHECK4: else: // CHECK4-NEXT: br label [[IFCONT]] // CHECK4: ifcont: -// CHECK4-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK4-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] // CHECK4: then6: -// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK4-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK4-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK4-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK4-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK4-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK4-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK4-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 // CHECK4-NEXT: br label [[IFCONT8:%.*]] // CHECK4: else7: // CHECK4-NEXT: br label [[IFCONT8]] // CHECK4: ifcont8: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() @@ -5032,68 +1270,68 @@ // CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) // CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK4: then: // CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 // CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK4-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK4-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 // CHECK4-NEXT: br label [[IFCONT:%.*]] // CHECK4: else: // CHECK4-NEXT: br label [[IFCONT]] // CHECK4: ifcont: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) // CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] // CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK4: then4: // CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK4-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 // CHECK4-NEXT: br label [[IFCONT6:%.*]] // CHECK4: else5: // CHECK4-NEXT: br label [[IFCONT6]] // CHECK4: ifcont6: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) // CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] // CHECK4: then8: // CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 // CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* // CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK4-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 // CHECK4-NEXT: br label [[IFCONT10:%.*]] // CHECK4: else9: // CHECK4-NEXT: br label [[IFCONT10]] // CHECK4: ifcont10: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] // CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] // CHECK4: then12: -// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK4-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK4-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 // CHECK4-NEXT: br label [[IFCONT14:%.*]] // CHECK4: else13: // CHECK4-NEXT: br label [[IFCONT14]] // CHECK4: ifcont14: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -5105,25 +1343,24 @@ // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 // CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -5134,24 +1371,23 @@ // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* // CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -5163,25 +1399,24 @@ // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 // CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 @@ -5192,3541 +1427,7093 @@ // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* // CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] // CHECK4-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 +// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 +// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 +// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] +// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: br label [[PRECOND:%.*]] +// CHECK5: precond: +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK5: body: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: br label [[PRECOND]] +// CHECK5: exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK5-SAME: () #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* +// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 +// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 +// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 +// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 +// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 +// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK5: cond.true9: +// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END11:%.*]] +// CHECK5: cond.false10: +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END11]] +// CHECK5: cond.end11: +// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 +// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 +// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* +// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 +// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] +// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: br label [[PRECOND:%.*]] +// CHECK6: precond: +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK6: body: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: br label [[PRECOND]] +// CHECK6: exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* +// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 +// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 +// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 +// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 +// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 +// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK6: then6: +// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK6-NEXT: br label [[IFCONT8:%.*]] +// CHECK6: else7: +// CHECK6-NEXT: br label [[IFCONT8]] +// CHECK6: ifcont8: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK6: then8: +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK6-NEXT: br label [[IFCONT10:%.*]] +// CHECK6: else9: +// CHECK6-NEXT: br label [[IFCONT10]] +// CHECK6: ifcont10: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK6: then12: +// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK6-NEXT: br label [[IFCONT14:%.*]] +// CHECK6: else13: +// CHECK6-NEXT: br label [[IFCONT14]] +// CHECK6: ifcont14: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK6: cond.true9: +// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK6-NEXT: br label [[COND_END11:%.*]] +// CHECK6: cond.false10: +// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK6-NEXT: br label [[COND_END11]] +// CHECK6: cond.end11: +// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK6: then6: +// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK6-NEXT: br label [[IFCONT8:%.*]] +// CHECK6: else7: +// CHECK6-NEXT: br label [[IFCONT8]] +// CHECK6: ifcont8: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK6: then8: +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK6-NEXT: br label [[IFCONT10:%.*]] +// CHECK6: else9: +// CHECK6-NEXT: br label [[IFCONT10]] +// CHECK6: ifcont10: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK6: then12: +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK6-NEXT: br label [[IFCONT14:%.*]] +// CHECK6: else13: +// CHECK6-NEXT: br label [[IFCONT14]] +// CHECK6: ifcont14: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK6: then6: +// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK6-NEXT: br label [[IFCONT8:%.*]] +// CHECK6: else7: +// CHECK6-NEXT: br label [[IFCONT8]] +// CHECK6: ifcont8: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK6: then8: +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK6-NEXT: br label [[IFCONT10:%.*]] +// CHECK6: else9: +// CHECK6-NEXT: br label [[IFCONT10]] +// CHECK6: ifcont10: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK6: then12: +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK6-NEXT: br label [[IFCONT14:%.*]] +// CHECK6: else13: +// CHECK6-NEXT: br label [[IFCONT14]] +// CHECK6: ifcont14: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21 +// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load double, double* [[CONV]], align 8 +// CHECK1-NEXT: [[E7:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E7]] to double* +// CHECK1-NEXT: store double [[TMP5]], double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR3]] +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E7]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) -// CHECK4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK4-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK4: .omp.reduction.then: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK4-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK4: .omp.reduction.done: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK1-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK4-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK4-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK4-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) -// CHECK4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK4-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK4: .omp.reduction.then: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK4-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK4-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK4-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK4-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK4: cond.true9: -// CHECK4-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK4-NEXT: br label [[COND_END11:%.*]] -// CHECK4: cond.false10: -// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK4-NEXT: br label [[COND_END11]] -// CHECK4: cond.end11: -// CHECK4-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK4-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK4: .omp.reduction.done: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 +// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK1-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK1-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND:%.*]] +// CHECK1: precond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK1: body: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND]] +// CHECK1: exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK4: then: -// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK4-NEXT: br label [[IFCONT:%.*]] -// CHECK4: else: -// CHECK4-NEXT: br label [[IFCONT]] -// CHECK4: ifcont: -// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK4: then6: -// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK4-NEXT: br label [[IFCONT8:%.*]] -// CHECK4: else7: -// CHECK4-NEXT: br label [[IFCONT8]] -// CHECK4: ifcont8: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK4: then: -// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK4-NEXT: br label [[IFCONT:%.*]] -// CHECK4: else: -// CHECK4-NEXT: br label [[IFCONT]] -// CHECK4: ifcont: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK4: then4: -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK4-NEXT: br label [[IFCONT6:%.*]] -// CHECK4: else5: -// CHECK4-NEXT: br label [[IFCONT6]] -// CHECK4: ifcont6: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK4: then8: -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK4-NEXT: br label [[IFCONT10:%.*]] -// CHECK4: else9: -// CHECK4-NEXT: br label [[IFCONT10]] -// CHECK4: ifcont10: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK4: then12: -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK4-NEXT: br label [[IFCONT14:%.*]] -// CHECK4: else13: -// CHECK4-NEXT: br label [[IFCONT14]] -// CHECK4: ifcont14: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK4: then: -// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK4-NEXT: br label [[IFCONT:%.*]] -// CHECK4: else: -// CHECK4-NEXT: br label [[IFCONT]] -// CHECK4: ifcont: -// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK4: then6: -// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK4-NEXT: br label [[IFCONT8:%.*]] -// CHECK4: else7: -// CHECK4-NEXT: br label [[IFCONT8]] -// CHECK4: ifcont8: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK4: then: -// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK4-NEXT: br label [[IFCONT:%.*]] -// CHECK4: else: -// CHECK4-NEXT: br label [[IFCONT]] -// CHECK4: ifcont: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK4: then4: -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK4-NEXT: br label [[IFCONT6:%.*]] -// CHECK4: else5: -// CHECK4-NEXT: br label [[IFCONT6]] -// CHECK4: ifcont6: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK4: then8: -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK4-NEXT: br label [[IFCONT10:%.*]] -// CHECK4: else9: -// CHECK4-NEXT: br label [[IFCONT10]] -// CHECK4: ifcont10: -// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK4: then12: -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK4-NEXT: br label [[IFCONT14:%.*]] -// CHECK4: else13: -// CHECK4-NEXT: br label [[IFCONT14]] -// CHECK4: ifcont14: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 8 +// CHECK1-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1 +// CHECK1-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 8 +// CHECK1-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float* +// CHECK1-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]] +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D9]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C8]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* +// CHECK1-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK1-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32 +// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]] +// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D2]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK1-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK1-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK1-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK1-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK1: then6: +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 +// CHECK1-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK1-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 +// CHECK1-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK1-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK1-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK1-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK1-NEXT: br label [[IFCONT8:%.*]] +// CHECK1: else7: +// CHECK1-NEXT: br label [[IFCONT8]] +// CHECK1: ifcont8: +// CHECK1-NEXT: ret void +// // +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK1-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK1-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK1-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK1: then8: +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK1-NEXT: br label [[IFCONT10:%.*]] +// CHECK1: else9: +// CHECK1-NEXT: br label [[IFCONT10]] +// CHECK1: ifcont10: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK1: then12: +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK1-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK1-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK1-NEXT: br label [[IFCONT14:%.*]] +// CHECK1: else13: +// CHECK1-NEXT: br label [[IFCONT14]] +// CHECK1: ifcont14: +// CHECK1-NEXT: ret void // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void // +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK1-NEXT: ret void // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 -// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 -// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* -// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 -// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 -// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] -// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 -// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 -// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK5-NEXT: br label [[PRECOND:%.*]] -// CHECK5: precond: -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK5: body: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK5-NEXT: br label [[PRECOND]] -// CHECK5: exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l34 +// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2) +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK1: cond.true9: +// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK1-NEXT: br label [[COND_END11:%.*]] +// CHECK1: cond.false10: +// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK1-NEXT: br label [[COND_END11]] +// CHECK1: cond.end11: +// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK1: then6: +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 +// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 +// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 +// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK1-NEXT: br label [[IFCONT8:%.*]] +// CHECK1: else7: +// CHECK1-NEXT: br label [[IFCONT8]] +// CHECK1: ifcont8: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK1: then8: +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK1-NEXT: br label [[IFCONT10:%.*]] +// CHECK1: else9: +// CHECK1-NEXT: br label [[IFCONT10]] +// CHECK1: ifcont10: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK1: then12: +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK1-NEXT: br label [[IFCONT14:%.*]] +// CHECK1: else13: +// CHECK1-NEXT: br label [[IFCONT14]] +// CHECK1: ifcont14: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK1: then6: +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 +// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 +// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 +// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK1-NEXT: br label [[IFCONT8:%.*]] +// CHECK1: else7: +// CHECK1-NEXT: br label [[IFCONT8]] +// CHECK1: ifcont8: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK1: then8: +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK1-NEXT: br label [[IFCONT10:%.*]] +// CHECK1: else9: +// CHECK1-NEXT: br label [[IFCONT10]] +// CHECK1: ifcont10: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK1: then12: +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK1-NEXT: br label [[IFCONT14:%.*]] +// CHECK1: else13: +// CHECK1-NEXT: br label [[IFCONT14]] +// CHECK1: ifcont14: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* -// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 -// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 -// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 -// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] -// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK5: then8: -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK5-NEXT: br label [[IFCONT10:%.*]] -// CHECK5: else9: -// CHECK5-NEXT: br label [[IFCONT10]] -// CHECK5: ifcont10: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK5: then12: -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK5-NEXT: br label [[IFCONT14:%.*]] -// CHECK5: else13: -// CHECK5-NEXT: br label [[IFCONT14]] -// CHECK5: ifcont14: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21 +// CHECK2-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker() #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK2-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK2-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK2-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK2-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: br label [[PRECOND:%.*]] +// CHECK2: precond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK2: body: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: br label [[PRECOND]] +// CHECK2: exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) -// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) -// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK5: cond.true9: -// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: br label [[COND_END11:%.*]] -// CHECK5: cond.false10: -// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: br label [[COND_END11]] -// CHECK5: cond.end11: -// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] -// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK2-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK2-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1 +// CHECK2-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK2-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float* +// CHECK2-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]] +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D9]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C8]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* +// CHECK2-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK2-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32 +// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D2]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK5: then8: -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK5-NEXT: br label [[IFCONT10:%.*]] -// CHECK5: else9: -// CHECK5-NEXT: br label [[IFCONT10]] -// CHECK5: ifcont10: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK5: then12: -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK5-NEXT: br label [[IFCONT14:%.*]] -// CHECK5: else13: -// CHECK5-NEXT: br label [[IFCONT14]] -// CHECK5: ifcont14: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK2-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK2-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK2-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK2-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK2-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK2-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] -// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK2-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK2-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK2: then8: +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK2-NEXT: br label [[IFCONT10:%.*]] +// CHECK2: else9: +// CHECK2-NEXT: br label [[IFCONT10]] +// CHECK2: ifcont10: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK2: then12: +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK2-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK2-NEXT: br label [[IFCONT14:%.*]] +// CHECK2: else13: +// CHECK2-NEXT: br label [[IFCONT14]] +// CHECK2: ifcont14: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK5: then8: -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK5-NEXT: br label [[IFCONT10:%.*]] -// CHECK5: else9: -// CHECK5-NEXT: br label [[IFCONT10]] -// CHECK5: ifcont10: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK5: then12: -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK5-NEXT: br label [[IFCONT14:%.*]] -// CHECK5: else13: -// CHECK5-NEXT: br label [[IFCONT14]] -// CHECK5: ifcont14: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l34 +// CHECK2-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK2: cond.true9: +// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK2-NEXT: br label [[COND_END11:%.*]] +// CHECK2: cond.false10: +// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK2-NEXT: br label [[COND_END11]] +// CHECK2: cond.end11: +// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret void +// // +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK2: then8: +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK2-NEXT: br label [[IFCONT10:%.*]] +// CHECK2: else9: +// CHECK2-NEXT: br label [[IFCONT10]] +// CHECK2: ifcont10: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK2: then12: +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK2-NEXT: br label [[IFCONT14:%.*]] +// CHECK2: else13: +// CHECK2-NEXT: br label [[IFCONT14]] +// CHECK2: ifcont14: +// CHECK2-NEXT: ret void // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 -// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 -// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* -// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 -// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK6: .omp.reduction.then: -// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 -// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] -// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) -// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK6: .omp.reduction.done: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK2: then8: +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK2-NEXT: br label [[IFCONT10:%.*]] +// CHECK2: else9: +// CHECK2-NEXT: br label [[IFCONT10]] +// CHECK2: ifcont10: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK2: then12: +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK2-NEXT: br label [[IFCONT14:%.*]] +// CHECK2: else13: +// CHECK2-NEXT: br label [[IFCONT14]] +// CHECK2: ifcont14: +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 -// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 -// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: br label [[PRECOND:%.*]] -// CHECK6: precond: -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK6: body: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: br label [[PRECOND]] -// CHECK6: exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21 +// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9:![0-9]+]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10:![0-9]+]] +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l21_worker() #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK3-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK6-SAME: () #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK3-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK3-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 2048, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK3-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK3-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK3-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK3-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK3-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK3-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* -// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 -// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 -// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 -// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK6: .omp.reduction.then: -// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK6: .omp.reduction.done: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: br label [[PRECOND:%.*]] +// CHECK3: precond: +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK3: body: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: br label [[PRECOND]] +// CHECK3: exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] -// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK6: then8: -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK6-NEXT: br label [[IFCONT10:%.*]] -// CHECK6: else9: -// CHECK6-NEXT: br label [[IFCONT10]] -// CHECK6: ifcont10: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK6: then12: -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK6-NEXT: br label [[IFCONT14:%.*]] -// CHECK6: else13: -// CHECK6-NEXT: br label [[IFCONT14]] -// CHECK6: ifcont14: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker +// CHECK3-SAME: () #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK3-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK3-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1 +// CHECK3-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK3-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float* +// CHECK3-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]] +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D9]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C8]]) +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK3-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK3-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* +// CHECK3-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK3-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 2048, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK3-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32 +// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D2]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK3-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK3-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK3-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK3-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK3-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK3-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK3-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK3-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK3-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK3-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK3-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK3-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK3-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK3-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK3-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK3-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK3-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) -// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK6: .omp.reduction.then: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK6: .omp.reduction.done: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) -// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK6: .omp.reduction.then: -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK6: cond.true9: -// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK6-NEXT: br label [[COND_END11:%.*]] -// CHECK6: cond.false10: -// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK6-NEXT: br label [[COND_END11]] -// CHECK6: cond.end11: -// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK6: .omp.reduction.done: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l34 +// CHECK3-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range [[RNG9]] +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] -// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK6: then8: -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK6-NEXT: br label [[IFCONT10:%.*]] -// CHECK6: else9: -// CHECK6-NEXT: br label [[IFCONT10]] -// CHECK6: ifcont10: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK6: then12: -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK6-NEXT: br label [[IFCONT14:%.*]] -// CHECK6: else13: -// CHECK6-NEXT: br label [[IFCONT14]] -// CHECK6: ifcont14: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK3: cond.true9: +// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END11:%.*]] +// CHECK3: cond.false10: +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END11]] +// CHECK3: cond.end11: +// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] -// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK6: then8: -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK6-NEXT: br label [[IFCONT10:%.*]] -// CHECK6: else9: -// CHECK6-NEXT: br label [[IFCONT10]] -// CHECK6: ifcont10: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK6: then12: -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK6-NEXT: br label [[IFCONT14:%.*]] -// CHECK6: else13: -// CHECK6-NEXT: br label [[IFCONT14]] -// CHECK6: ifcont14: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range [[RNG10]] +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range [[RNG8]] +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp --- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp @@ -95,39 +95,38 @@ // CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG45]] // CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG45]] // CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG45]] -// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG45]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG45]], !range [[RNG46:![0-9]+]] // CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG45]] -// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG45]] // CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG45]] // CHECK: .execute: // CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG46:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG46]] -// CHECK-NEXT: store i32 [[TMP9]], i32* [[CONV]], align 4, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP12:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG46]] -// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*, !dbg [[DBG46]] -// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG46]] -// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG46]] -// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP17]], align 8, !dbg [[DBG46]] -// CHECK-NEXT: [[TMP18:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG46]] -// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP18]], i64 4), !dbg [[DBG46]] -// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG47:![0-9]+]] +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG47:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG47]] +// CHECK-NEXT: store i32 [[TMP9]], i32* [[CONV]], align 4, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP12:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG47]] +// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*, !dbg [[DBG47]] +// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG47]] +// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG47]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP17]], align 8, !dbg [[DBG47]] +// CHECK-NEXT: [[TMP18:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG47]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP18]], i64 4), !dbg [[DBG47]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG48:![0-9]+]] // CHECK: .omp.deinit: -// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG47]] -// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG47]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG48]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG48]] // CHECK: .exit: -// CHECK-NEXT: ret void, !dbg [[DBG49:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG50:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__ -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG50:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]]* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG51:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -144,85 +143,85 @@ // CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META58:![0-9]+]], metadata !DIExpression()), !dbg [[DBG59:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG59]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG61:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG62:![0-9]+]] // CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG63:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META64:![0-9]+]], metadata !DIExpression()), !dbg [[DBG65:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META65:![0-9]+]], metadata !DIExpression()), !dbg [[DBG66:![0-9]+]] // CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG68:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG68]] -// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG68]] -// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG68]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]* [[B3]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]] -// CHECK-NEXT: [[TMP8:%.*]] = bitcast [10 x [10 x i32]]* [[B3]] to i8*, !dbg [[DBG68]] -// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG68]] -// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 400, i1 false), !dbg [[DBG68]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG74:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG74]] -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG74]] -// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG73]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]] -// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG76]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 1, !dbg [[DBG79:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG79]] -// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG78]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]] -// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG81]] -// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG82:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG83:![0-9]+]] -// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG84:![0-9]+]] -// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG83]] -// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG83]] -// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG85:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG86:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG86]] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG87:![0-9]+]] -// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG86]] -// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG86]] -// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG88:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG89:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG89]] -// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG90:![0-9]+]] -// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG89]] -// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG89]] -// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG89]] -// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG91:![0-9]+]] -// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG92:![0-9]+]] -// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG91]] -// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG91]] -// CHECK-NEXT: store i32 [[TMP13]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG93:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG94:![0-9]+]] -// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG95:![0-9]+]] -// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP15]] to i64, !dbg [[DBG94]] -// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG94]] -// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG94]] -// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP7]], align 1, !dbg [[DBG96:![0-9]+]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG96]] -// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG96]] -// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP16]], !dbg [[DBG96]] -// CHECK-NEXT: [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG96]] -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG96]] -// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP7]], align 1, !dbg [[DBG96]] -// CHECK-NEXT: ret void, !dbg [[DBG97:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG69:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG69]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP3]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP5:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG69]] +// CHECK-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG69]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]* [[B3]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG59]] +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [10 x [10 x i32]]* [[B3]] to i8*, !dbg [[DBG69]] +// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x [10 x i32]]* [[TMP4]] to i8*, !dbg [[DBG69]] +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 400, i1 false), !dbg [[DBG69]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG75:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG75]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG75]] +// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG74]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]] +// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG77]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 1, !dbg [[DBG80:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG80]] +// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG79]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG82]] +// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG83:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG84:![0-9]+]] +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG85:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG84]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG84]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG86:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG87:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG87]] +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG88:![0-9]+]] +// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG87]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG87]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG89:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG90:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG90]] +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG91:![0-9]+]] +// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG90]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG90]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG90]] +// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG92:![0-9]+]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG93:![0-9]+]] +// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG92]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG92]] +// CHECK-NEXT: store i32 [[TMP13]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG94:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[B3]], i64 0, i64 0, !dbg [[DBG95:![0-9]+]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]] +// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP15]] to i64, !dbg [[DBG95]] +// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG95]] +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG95]] +// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP7]], align 1, !dbg [[DBG97:![0-9]+]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG97]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG97]] +// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP16]], !dbg [[DBG97]] +// CHECK-NEXT: [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG97]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG97]] +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP7]], align 1, !dbg [[DBG97]] +// CHECK-NEXT: ret void, !dbg [[DBG98:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG98:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG99:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -231,64 +230,64 @@ // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META105:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META106:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META107:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] // CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG112:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG112]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG112]] -// CHECK-NEXT: call void @__omp_outlined___debug__(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]]* [[TMP7]], i8 addrspace(1)* [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG112]] -// CHECK-NEXT: ret void, !dbg [[DBG112]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META112:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG113:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG113]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG113]] +// CHECK-NEXT: call void @__omp_outlined___debug__(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]]* [[TMP7]], i8 addrspace(1)* [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG113]] +// CHECK-NEXT: ret void, !dbg [[DBG113]] // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 -// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG113:![0-9]+]] { +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG114:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG118:![0-9]+]] // CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG118]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG118]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG117]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG121:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG121]] -// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG121]] -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP5]], i8 addrspace(1)* [[TMP8]]) #[[ATTR4]], !dbg [[DBG121]] -// CHECK-NEXT: ret void, !dbg [[DBG121]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META121:![0-9]+]], metadata !DIExpression()), !dbg [[DBG118]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG122:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG122]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG122]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP5]], i8 addrspace(1)* [[TMP8]]) #[[ATTR4]], !dbg [[DBG122]] +// CHECK-NEXT: ret void, !dbg [[DBG122]] // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__ -// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG122:![0-9]+]] { +// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG123:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 @@ -300,58 +299,57 @@ // CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 // CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 // CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META128:![0-9]+]], metadata !DIExpression()), !dbg [[DBG129:![0-9]+]] // CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META130:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131:![0-9]+]] // CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG132:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133:![0-9]+]] // CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG135:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG135]] -// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG135]] -// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG135]] -// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG135]] -// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG135]] -// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG135]] -// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG135]] -// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG135]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG135:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG136]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG136]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG136]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG136]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG136]], !range [[RNG46]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG136]] +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG136]] // CHECK: .execute: // CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG136:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG136]] -// CHECK-NEXT: store i32 [[TMP10]], i32* [[CONV]], align 4, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP13:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG136]] -// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP11]] to i8*, !dbg [[DBG136]] -// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP17:%.*]] = bitcast [10 x [10 x i32]]* [[TMP5]] to i8*, !dbg [[DBG136]] -// CHECK-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG136]] -// CHECK-NEXT: store i8* [[TMP8]], i8** [[TMP18]], align 8, !dbg [[DBG136]] -// CHECK-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG136]] -// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP19]], i64 4), !dbg [[DBG136]] -// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG137:![0-9]+]] +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG137:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_CASTED]] to i32*, !dbg [[DBG137]] +// CHECK-NEXT: store i32 [[TMP10]], i32* [[CONV]], align 4, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[A_CASTED]], align 8, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG137]] +// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP11]] to i8*, !dbg [[DBG137]] +// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP14]], align 8, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP17:%.*]] = bitcast [10 x [10 x i32]]* [[TMP5]] to i8*, !dbg [[DBG137]] +// CHECK-NEXT: store i8* [[TMP17]], i8** [[TMP16]], align 8, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG137]] +// CHECK-NEXT: store i8* [[TMP8]], i8** [[TMP18]], align 8, !dbg [[DBG137]] +// CHECK-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG137]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP19]], i64 4), !dbg [[DBG137]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG138:![0-9]+]] // CHECK: .omp.deinit: -// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG137]] -// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG137]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG138]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG138]] // CHECK: .exit: -// CHECK-NEXT: ret void, !dbg [[DBG139:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG140:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__1 -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG140:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG141:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -367,74 +365,74 @@ // CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG144:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG144]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META147:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148:![0-9]+]] // CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META149:![0-9]+]], metadata !DIExpression()), !dbg [[DBG150:![0-9]+]] // CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]] // CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG154:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG154]] -// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG154]] -// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG154]] -// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG154]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG157:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG158:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG158]] -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG158]] -// CHECK-NEXT: store i32* [[ARRAYIDX4]], i32** [[F]], align 8, !dbg [[DBG157]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]] -// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG160]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 1, !dbg [[DBG163:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG163]] -// CHECK-NEXT: store i32* [[ARRAYIDX6]], i32** [[H]], align 8, !dbg [[DBG162]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG165:![0-9]+]] -// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG165]] -// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG166:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG167:![0-9]+]] -// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG168:![0-9]+]] -// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG167]] -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG167]] -// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX8]], align 4, !dbg [[DBG169:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG170:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG170]] -// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG171:![0-9]+]] -// CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG170]] -// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG170]] -// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX12]], align 4, !dbg [[DBG172:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG173:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG173]] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG174:![0-9]+]] -// CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG173]] -// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG173]] -// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !dbg [[DBG173]] -// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG175:![0-9]+]] -// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG176:![0-9]+]] -// CHECK-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG175]] -// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG175]] -// CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX19]], align 4, !dbg [[DBG177:![0-9]+]] -// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP8]], align 1, !dbg [[DBG178:![0-9]+]] -// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG178]] -// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG178]] -// CHECK-NEXT: store i32 [[CONV]], i32* [[D]], align 4, !dbg [[DBG179:![0-9]+]] -// CHECK-NEXT: ret void, !dbg [[DBG180:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG155:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG155]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP3]] to [10 x [10 x i32]]*, !dbg [[DBG155]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP4]], [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP1]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG155]] +// CHECK-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG155]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META156:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG159:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG159]] +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG159]] +// CHECK-NEXT: store i32* [[ARRAYIDX4]], i32** [[F]], align 8, !dbg [[DBG158]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META160:![0-9]+]], metadata !DIExpression()), !dbg [[DBG161:![0-9]+]] +// CHECK-NEXT: store i32* [[A_ADDR]], i32** [[G]], align 8, !dbg [[DBG161]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META162:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 1, !dbg [[DBG164:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG164]] +// CHECK-NEXT: store i32* [[ARRAYIDX6]], i32** [[H]], align 8, !dbg [[DBG163]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG166:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG166]] +// CHECK-NEXT: store i32 5, i32* [[A_ADDR]], align 4, !dbg [[DBG167:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG168:![0-9]+]] +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG169:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG168]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG168]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX8]], align 4, !dbg [[DBG170:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG171:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG171]] +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG172:![0-9]+]] +// CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG171]] +// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG171]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX12]], align 4, !dbg [[DBG173:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG174:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG174]] +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG175:![0-9]+]] +// CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG174]] +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG174]] +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !dbg [[DBG174]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP5]], i64 0, i64 0, !dbg [[DBG176:![0-9]+]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG177:![0-9]+]] +// CHECK-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG176]] +// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG176]] +// CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX19]], align 4, !dbg [[DBG178:![0-9]+]] +// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP8]], align 1, !dbg [[DBG179:![0-9]+]] +// CHECK-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG179]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG179]] +// CHECK-NEXT: store i32 [[CONV]], i32* [[D]], align 4, !dbg [[DBG180:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG181:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG181:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG182:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -443,66 +441,66 @@ // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184]] // CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG189:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP7]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG189]] -// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG189]] -// CHECK-NEXT: call void @__omp_outlined___debug__1(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG189]] -// CHECK-NEXT: ret void, !dbg [[DBG189]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG190:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP7:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP8:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP5]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP7]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG190]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP8]] to i8 addrspace(1)*, !dbg [[DBG190]] +// CHECK-NEXT: call void @__omp_outlined___debug__1(i32* [[TMP3]], i32* [[TMP4]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP9]], i32 [[TMP6]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG190]] +// CHECK-NEXT: ret void, !dbg [[DBG190]] // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37 -// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG190:![0-9]+]] { +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG191:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META191:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]] // CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META193:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i64* [[A_ADDR]], metadata [[META194:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META194:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG196:![0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP5]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG196]] -// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG196]] -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]] addrspace(1)* [[TMP8]], i8 addrspace(1)* [[TMP9]]) #[[ATTR4]], !dbg [[DBG196]] -// CHECK-NEXT: ret void, !dbg [[DBG196]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG197:![0-9]+]] +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP1:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP3:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP3]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP5]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG197]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i8* [[TMP6]] to i8 addrspace(1)*, !dbg [[DBG197]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP7]], i32 [[TMP4]], [10 x [10 x i32]] addrspace(1)* [[TMP8]], i8 addrspace(1)* [[TMP9]]) #[[ATTR4]], !dbg [[DBG197]] +// CHECK-NEXT: ret void, !dbg [[DBG197]] // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__ -// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG197:![0-9]+]] { +// CHECK-SAME: ([10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG198:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]] addrspace(1)*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8 @@ -514,58 +512,57 @@ // CHECK-NEXT: [[_TMP3:%.*]] = alloca i8*, align 8 // CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 // CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META202:![0-9]+]], metadata !DIExpression()), !dbg [[DBG203:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG204:![0-9]+]] // CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]] // CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG208:![0-9]+]] // CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META208:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG210:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG210]] -// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG210]] -// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG210]] -// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG210]] -// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG210]] -// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG210]] -// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG210]] -// CHECK-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG210]] -// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG210]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META209:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG211:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG211]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG211]] +// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG211]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG211]] +// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG211]] +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG211]], !range [[RNG46]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG211]] +// CHECK-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG211]] // CHECK: .execute: // CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]]) -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]] -// CHECK-NEXT: [[TMP14:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG211]] -// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP5]] to i8*, !dbg [[DBG211]] -// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP8]] to i8*, !dbg [[DBG211]] -// CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG211]] -// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP19]], align 8, !dbg [[DBG211]] -// CHECK-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG211]] -// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB5]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG211]] -// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG212:![0-9]+]] +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG212:![0-9]+]] +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [10 x [10 x [10 x i32]]]* [[TMP2]] to i8*, !dbg [[DBG212]] +// CHECK-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP5]] to i8*, !dbg [[DBG212]] +// CHECK-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP8]] to i8*, !dbg [[DBG212]] +// CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG212]] +// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP19]], align 8, !dbg [[DBG212]] +// CHECK-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG212]] +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB5]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG212]] +// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]], !dbg [[DBG213:![0-9]+]] // CHECK: .omp.deinit: -// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG212]] -// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG212]] +// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1), !dbg [[DBG213]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]], !dbg [[DBG213]] // CHECK: .exit: -// CHECK-NEXT: ret void, !dbg [[DBG214:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG215:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined___debug__3 -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG215:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]] addrspace(1)* noalias [[C:%.*]], i32 addrspace(1)* noalias [[A:%.*]], [10 x [10 x i32]] addrspace(1)* noalias [[B:%.*]], i8 addrspace(1)* noalias [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG216:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -582,82 +579,82 @@ // CHECK-NEXT: [[H:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[D:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG219:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META219:![0-9]+]], metadata !DIExpression()), !dbg [[DBG220:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META220:![0-9]+]], metadata !DIExpression()), !dbg [[DBG219]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG220]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]] addrspace(1)* [[C]], [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], metadata [[META222:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223:![0-9]+]] // CHECK-NEXT: store i32 addrspace(1)* [[A]], i32 addrspace(1)** [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32 addrspace(1)** [[A_ADDR]], metadata [[META224:![0-9]+]], metadata !DIExpression()), !dbg [[DBG225:![0-9]+]] // CHECK-NEXT: store [10 x [10 x i32]] addrspace(1)* [[B]], [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG227:![0-9]+]] // CHECK-NEXT: store i8 addrspace(1)* [[BB]], i8 addrspace(1)** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG229]] -// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG229]] -// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG229]] -// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG229]] -// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG229]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG232:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG233:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG233]] -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG233]] -// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG232]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]] -// CHECK-NEXT: store i32* [[TMP5]], i32** [[G]], align 8, !dbg [[DBG235]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG237:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 1, !dbg [[DBG238:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG238]] -// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG237]] -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG240:![0-9]+]] -// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG240]] -// CHECK-NEXT: store i32 5, i32* [[TMP5]], align 4, !dbg [[DBG241:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG242:![0-9]+]] -// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG243:![0-9]+]] -// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG242]] -// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG242]] -// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG244:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG245:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG245]] -// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG246:![0-9]+]] -// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG245]] -// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG245]] -// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG247:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG248:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG248]] -// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG249:![0-9]+]] -// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG248]] -// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG248]] -// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG248]] -// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG250:![0-9]+]] -// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG251:![0-9]+]] -// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG250]] -// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG250]] -// CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG252:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG253:![0-9]+]] -// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG254:![0-9]+]] -// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG253]] -// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG253]] -// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG253]] -// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG253]] -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG255:![0-9]+]] -// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP11]], align 1, !dbg [[DBG255]] -// CHECK-NEXT: ret void, !dbg [[DBG256:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8 addrspace(1)** [[BB_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG229:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]] addrspace(1)*, [10 x [10 x [10 x i32]]] addrspace(1)** [[C_ADDR]], align 8, !dbg [[DBG230:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP0]] to [10 x [10 x [10 x i32]]]*, !dbg [[DBG230]] +// CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[TMP1]], [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[TMP]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)** [[A_ADDR]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP4:%.*]] = addrspacecast i32 addrspace(1)* [[TMP3]] to i32*, !dbg [[DBG230]] +// CHECK-NEXT: store i32* [[TMP4]], i32** [[_TMP1]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[_TMP1]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]] addrspace(1)*, [10 x [10 x i32]] addrspace(1)** [[B_ADDR]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP7:%.*]] = addrspacecast [10 x [10 x i32]] addrspace(1)* [[TMP6]] to [10 x [10 x i32]]*, !dbg [[DBG230]] +// CHECK-NEXT: store [10 x [10 x i32]]* [[TMP7]], [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[_TMP2]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)** [[BB_ADDR]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG230]] +// CHECK-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG230]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[F]], metadata [[META231:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 1, !dbg [[DBG234:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG234]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG234]] +// CHECK-NEXT: store i32* [[ARRAYIDX5]], i32** [[F]], align 8, !dbg [[DBG233]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[G]], metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG236:![0-9]+]] +// CHECK-NEXT: store i32* [[TMP5]], i32** [[G]], align 8, !dbg [[DBG236]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[H]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG238:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 1, !dbg [[DBG239:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG239]] +// CHECK-NEXT: store i32* [[ARRAYIDX7]], i32** [[H]], align 8, !dbg [[DBG238]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* [[D]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]] +// CHECK-NEXT: store i32 15, i32* [[D]], align 4, !dbg [[DBG241]] +// CHECK-NEXT: store i32 5, i32* [[TMP5]], align 4, !dbg [[DBG242:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG243:![0-9]+]] +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG244:![0-9]+]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG243]] +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG243]] +// CHECK-NEXT: store i32 10, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG245:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG246:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG246]] +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG247:![0-9]+]] +// CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG246]] +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG246]] +// CHECK-NEXT: store i32 11, i32* [[ARRAYIDX13]], align 4, !dbg [[DBG248:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], [10 x [10 x [10 x i32]]]* [[TMP2]], i64 0, i64 0, !dbg [[DBG249:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG249]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG250:![0-9]+]] +// CHECK-NEXT: [[IDXPROM16:%.*]] = sext i32 [[TMP14]] to i64, !dbg [[DBG249]] +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG249]] +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX17]], align 4, !dbg [[DBG249]] +// CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG251:![0-9]+]] +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG252:![0-9]+]] +// CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP16]] to i64, !dbg [[DBG251]] +// CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG251]] +// CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX20]], align 4, !dbg [[DBG253:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP8]], i64 0, i64 0, !dbg [[DBG254:![0-9]+]] +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG255:![0-9]+]] +// CHECK-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP17]] to i64, !dbg [[DBG254]] +// CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG254]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX23]], align 4, !dbg [[DBG254]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP18]], 0, !dbg [[DBG254]] +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG256:![0-9]+]] +// CHECK-NEXT: store i8 [[FROMBOOL]], i8* [[TMP11]], align 1, !dbg [[DBG256]] +// CHECK-NEXT: ret void, !dbg [[DBG257:![0-9]+]] // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG257:![0-9]+]] { +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG258:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 @@ -666,62 +663,62 @@ // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META260:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTGLOBAL_TID__ADDR]], metadata [[META261:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]] // CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META262:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[DOTBOUND_TID__ADDR]], metadata [[META263:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262]] // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META263:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262]] // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META265:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META265:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG261]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG267:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP9:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP6]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i32* [[TMP7]] to i32 addrspace(1)*, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP12:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP8]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG267]] -// CHECK-NEXT: [[TMP13:%.*]] = addrspacecast i8* [[TMP9]] to i8 addrspace(1)*, !dbg [[DBG267]] -// CHECK-NEXT: call void @__omp_outlined___debug__3(i32* [[TMP4]], i32* [[TMP5]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP10]], i32 addrspace(1)* [[TMP11]], [10 x [10 x i32]] addrspace(1)* [[TMP12]], i8 addrspace(1)* [[TMP13]]) #[[ATTR4]], !dbg [[DBG267]] -// CHECK-NEXT: ret void, !dbg [[DBG267]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG268:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP8:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP9:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP6]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i32* [[TMP7]] to i32 addrspace(1)*, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP12:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP8]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG268]] +// CHECK-NEXT: [[TMP13:%.*]] = addrspacecast i8* [[TMP9]] to i8 addrspace(1)*, !dbg [[DBG268]] +// CHECK-NEXT: call void @__omp_outlined___debug__3(i32* [[TMP4]], i32* [[TMP5]], [10 x [10 x [10 x i32]]] addrspace(1)* [[TMP10]], i32 addrspace(1)* [[TMP11]], [10 x [10 x i32]] addrspace(1)* [[TMP12]], i8 addrspace(1)* [[TMP13]]) #[[ATTR4]], !dbg [[DBG268]] +// CHECK-NEXT: ret void, !dbg [[DBG268]] // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51 -// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG268:![0-9]+]] { +// CHECK-SAME: ([10 x [10 x [10 x i32]]]* nonnull align 4 dereferenceable(4000) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[B:%.*]], i8* nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG269:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x [10 x i32]]]*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 // CHECK-NEXT: [[BB_ADDR:%.*]] = alloca i8*, align 8 // CHECK-NEXT: store [10 x [10 x [10 x i32]]]* [[C]], [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x [10 x i32]]]** [[C_ADDR]], metadata [[META272:![0-9]+]], metadata !DIExpression()), !dbg [[DBG273:![0-9]+]] // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i32** [[A_ADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG273]] // CHECK-NEXT: store [10 x [10 x i32]]* [[B]], [10 x [10 x i32]]** [[B_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata [10 x [10 x i32]]** [[B_ADDR]], metadata [[META275:![0-9]+]], metadata !DIExpression()), !dbg [[DBG273]] // CHECK-NEXT: store i8* [[BB]], i8** [[BB_ADDR]], align 8 -// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META275:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272]] -// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG276:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP4]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i32* [[TMP5]] to i32 addrspace(1)*, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP6]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG276]] -// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP7]] to i8 addrspace(1)*, !dbg [[DBG276]] -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP8]], i32 addrspace(1)* [[TMP9]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG276]] -// CHECK-NEXT: ret void, !dbg [[DBG276]] +// CHECK-NEXT: call void @llvm.dbg.declare(metadata i8** [[BB_ADDR]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG273]] +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG277:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP2:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP4:%.*]] = load [10 x [10 x [10 x i32]]]*, [10 x [10 x [10 x i32]]]** [[C_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[A_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP6:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[B_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP7:%.*]] = load i8*, i8** [[BB_ADDR]], align 8, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast [10 x [10 x [10 x i32]]]* [[TMP4]] to [10 x [10 x [10 x i32]]] addrspace(1)*, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP9:%.*]] = addrspacecast i32* [[TMP5]] to i32 addrspace(1)*, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast [10 x [10 x i32]]* [[TMP6]] to [10 x [10 x i32]] addrspace(1)*, !dbg [[DBG277]] +// CHECK-NEXT: [[TMP11:%.*]] = addrspacecast i8* [[TMP7]] to i8 addrspace(1)*, !dbg [[DBG277]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__([10 x [10 x [10 x i32]]] addrspace(1)* [[TMP8]], i32 addrspace(1)* [[TMP9]], [10 x [10 x i32]] addrspace(1)* [[TMP10]], i8 addrspace(1)* [[TMP11]]) #[[ATTR4]], !dbg [[DBG277]] +// CHECK-NEXT: ret void, !dbg [[DBG277]] // diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -429,20 +429,14 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) __OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32) __OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, ) -__OMP_RTL(__kmpc_get_team_static_memory, false, Void, Int16, VoidPtr, SizeTy, - Int16, VoidPtrPtr) -__OMP_RTL(__kmpc_restore_team_static_memory, false, Void, Int16, Int16) __OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1149,9 +1149,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -2,144 +2,41 @@ ; ModuleID = 'declare_target_codegen_globalization.cpp' source_filename = "declare_target_codegen_globalization.cpp" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" +target triple = "nvptx64" -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct._globalized_locals_ty = type { [32 x i32] } +; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1 -@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8 -@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0 -@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata" +@S = external local_unnamed_addr global i8* -; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. - -; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 { +define void @foo() { entry: - %nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13 - tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12 - %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) - %1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not.i.i = icmp eq i8 %1, 0 - br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit - -.non-spmd2.i.i: ; preds = %entry - %2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14 - br label %__omp_outlined__.exit, !dbg !14 - -__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i - tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19 - ret void, !dbg !20 + %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8 + %x_on_stack = bitcast i8* %0 to i32* + %1 = bitcast i32* %x_on_stack to i8* + call void @share(i8* %1) + call void @__kmpc_free_shared(i8* %0) + ret void } -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 - -declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr - -declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr - -; Function Attrs: norecurse nounwind readonly -define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 { +define void @share(i8* %x) { entry: - %0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23 - ret i32 %0, !dbg !27 + store i8* %x, i8** @S + ret void } -; Function Attrs: nounwind -define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 { -entry: - %a1 = alloca i32, align 4 - %0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not = icmp eq i8 %0, 0 - br i1 %.not, label %.non-spmd, label %.exit - -.non-spmd: ; preds = %entry - %1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31 - %2 = bitcast i8* %1 to %struct._globalized_locals_ty* - br label %.exit +declare i8* @__kmpc_alloc_shared(i64) -.exit: ; preds = %entry, %.non-spmd - %_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ] - %nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28 - %nvptx_lane_id = and i32 %nvptx_tid, 31 - %3 = zext i32 %nvptx_lane_id to i64 - %4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3 - %5 = select i1 %.not, i32* %4, i32* %a1 - %6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23 - br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31 +declare void @__kmpc_free_shared(i8*) -.non-spmd2: ; preds = %.exit - %7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31 - tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31 - br label %.exit3, !dbg !31 - -.exit3: ; preds = %.non-spmd2, %.exit - ret i32 %6, !dbg !31 -} - -declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr - -declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr - -declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 - -declare void @__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr - -; Function Attrs: nounwind -declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4 - -declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr - -attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } !llvm.dbg.cu = !{!0} -!omp_offload.info = !{!3} -!nvvm.annotations = !{!4} -!llvm.module.flags = !{!5, !6, !7, !8} -!llvm.ident = !{!9} +!llvm.module.flags = !{!3, !4} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP") +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c") !2 = !{} -!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0} -!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1} -!5 = !{i32 7, !"Dwarf Version", i32 2} -!6 = !{i32 2, !"Debug Info Version", i32 3} -!7 = !{i32 1, !"wchar_size", i32 4} -!8 = !{i32 7, !"PIC Level", i32 2} -!9 = !{!"clang version 12.0.0"} -!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!11 = !DISubroutineType(types: !2) -!12 = !DILocation(line: 17, column: 1, scope: !10) -!13 = !{i32 1, i32 1025} -!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16) -!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18) -!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!18 = distinct !DILocation(line: 17, column: 1, scope: !10) -!19 = !DILocation(line: 17, column: 40, scope: !10) -!20 = !DILocation(line: 21, column: 3, scope: !10) -!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!22 = !DILocation(line: 5, column: 26, scope: !21) -!23 = !{!24, !24, i64 0} -!24 = !{!"int", !25, i64 0} -!25 = !{!"omnipotent char", !26, i64 0} -!26 = !{!"Simple C++ TBAA"} -!27 = !DILocation(line: 5, column: 19, scope: !21) -!28 = !{i32 0, i32 1024} -!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30) -!30 = distinct !DILocation(line: 9, column: 10, scope: !15) -!31 = !DILocation(line: 10, column: 1, scope: !15) - +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 5, column: 7, scope: !6) diff --git a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll --- a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll +++ b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll @@ -7,11 +7,11 @@ define void @foo() { entry: - %x = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0), !dbg !7 + %x = call i8* @__kmpc_alloc_shared(i64 4), !dbg !7 %x_on_stack = bitcast i8* %x to i32* %0 = bitcast i32* %x_on_stack to i8* call void @use(i8* %0) - call void @__kmpc_data_sharing_pop_stack(i8* %x) + call void @__kmpc_free_shared(i8* %x) ret void } @@ -22,7 +22,7 @@ ret void } -define internal i8* @__kmpc_data_sharing_push_stack(i64 %DataSize, i16 %shared) { +define internal i8* @__kmpc_alloc_shared(i64 %DataSize) { entry: %call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11 ret i8* %call @@ -31,7 +31,7 @@ ; Function Attrs: convergent nounwind mustprogress declare i8* @_Z10SafeMallocmPKc(i64 %size, i8* nocapture readnone %msg) -declare void @__kmpc_data_sharing_pop_stack(i8*) +declare void @__kmpc_free_shared(i8*) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -24,187 +24,16 @@ // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } +// Allocate memory that can be shared between the threads. +// TODO: Add a small buffer of shared memory to allocate memory from +// TODO: Add an INFO message to communicate with the user +EXTERN void *__kmpc_alloc_shared(size_t DataSize) { + return (void *)SafeMalloc(DataSize, "Alloc Shared"); } -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void *data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. - int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t)data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } +// Free the allocated memory. +EXTERN void __kmpc_free_shared(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); } // Begin a data sharing context. Maintain a list of references to shared @@ -278,4 +107,21 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +// Deprecated globalization code +EXTERN void __kmpc_data_sharing_init_stack() {} +EXTERN void __kmpc_data_sharing_init_stack_spmd() {} + +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); +} + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -425,13 +425,8 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void *__kmpc_alloc_shared(size_t Size); +EXTERN void __kmpc_free_shared(void *Data); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); @@ -462,4 +457,11 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, int16_t is_shared); +// Deprecated globalization interface +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, int16_t s); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t s); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); + #endif